xref: /openbmc/qemu/migration/ram.c (revision 44602af8)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62 
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66 
67 /***********************************************************/
68 /* ram save/restore */
69 
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71  * worked for pages that were filled with the same char.  We switched
72  * it to only search for the zero value.  To avoid confusion with
73  * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
74  */
75 
76 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO     0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE     0x08
80 #define RAM_SAVE_FLAG_EOS      0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE   0x40
83 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
85 
86 XBZRLECacheStats xbzrle_counters;
87 
88 /* Struct containing the XBZRLE cache and a static page
89    used by the compression */
90 static struct {
91     /* buffer used for XBZRLE encoding */
92     uint8_t *encoded_buf;
93     /* buffer for storing page content */
94     uint8_t *current_buf;
95     /* Cache for XBZRLE, Protected by lock. */
96     PageCache *cache;
97     QemuMutex lock;
98     /* it will store a page full of zeros */
99     uint8_t *zero_target_page;
100     /* buffer used for XBZRLE decoding */
101     uint8_t *decoded_buf;
102 } XBZRLE;
103 
104 static void XBZRLE_cache_lock(void)
105 {
106     if (migrate_use_xbzrle()) {
107         qemu_mutex_lock(&XBZRLE.lock);
108     }
109 }
110 
111 static void XBZRLE_cache_unlock(void)
112 {
113     if (migrate_use_xbzrle()) {
114         qemu_mutex_unlock(&XBZRLE.lock);
115     }
116 }
117 
118 /**
119  * xbzrle_cache_resize: resize the xbzrle cache
120  *
121  * This function is called from migrate_params_apply in the main
122  * thread, possibly while a migration is in progress.  A running
123  * migration may be using the cache and might finish during this call,
124  * hence changes to the cache are protected by XBZRLE.lock.
125  *
126  * Returns 0 for success or -1 for error
127  *
128  * @new_size: new cache size
129  * @errp: set to the failure reason if the check fails
130  */
131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
132 {
133     PageCache *new_cache;
134     int64_t ret = 0;
135 
136     /* Check for truncation */
137     if (new_size != (size_t)new_size) {
138         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139                    "exceeding address space");
140         return -1;
141     }
142 
143     if (new_size == migrate_xbzrle_cache_size()) {
144         /* nothing to do */
145         return 0;
146     }
147 
148     XBZRLE_cache_lock();
149 
150     if (XBZRLE.cache != NULL) {
151         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152         if (!new_cache) {
153             ret = -1;
154             goto out;
155         }
156 
157         cache_fini(XBZRLE.cache);
158         XBZRLE.cache = new_cache;
159     }
160 out:
161     XBZRLE_cache_unlock();
162     return ret;
163 }
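/*
 * Usage sketch for xbzrle_cache_resize() (illustrative only; the 64 MiB
 * value is an arbitrary example, not a recommendation):
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 */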
164 
165 bool ramblock_is_ignored(RAMBlock *block)
166 {
167     return !qemu_ram_is_migratable(block) ||
168            (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 }
170 
171 #undef RAMBLOCK_FOREACH
172 
173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
174 {
175     RAMBlock *block;
176     int ret = 0;
177 
178     RCU_READ_LOCK_GUARD();
179 
180     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
181         ret = func(block, opaque);
182         if (ret) {
183             break;
184         }
185     }
186     return ret;
187 }
188 
189 static void ramblock_recv_map_init(void)
190 {
191     RAMBlock *rb;
192 
193     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
194         assert(!rb->receivedmap);
195         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
196     }
197 }
198 
199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
200 {
201     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
202                     rb->receivedmap);
203 }
204 
205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
206 {
207     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 }
209 
210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
211 {
212     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 }
214 
215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
216                                     size_t nr)
217 {
218     bitmap_set_atomic(rb->receivedmap,
219                       ramblock_recv_bitmap_offset(host_addr, rb),
220                       nr);
221 }
222 
223 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
224 
225 /*
226  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
227  *
228  * Returns the number of bytes sent (>0) on success, or <0 on error.
229  */
230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
231                                   const char *block_name)
232 {
233     RAMBlock *block = qemu_ram_block_by_name(block_name);
234     unsigned long *le_bitmap, nbits;
235     uint64_t size;
236 
237     if (!block) {
238         error_report("%s: invalid block name: %s", __func__, block_name);
239         return -1;
240     }
241 
242     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 
244     /*
245      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
246      * machines we may need 4 more bytes for padding (see the comment
247      * below), so extend it a bit beforehand.
248      */
249     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 
251     /*
252      * Always use little endian when sending the bitmap. This is
253      * required so that it works even when the source and destination
254      * VMs do not use the same endianness. (Note: big endian won't work.)
255      */
256     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
257 
258     /* Size of the bitmap, in bytes */
259     size = DIV_ROUND_UP(nbits, 8);
260 
261     /*
262      * size is always aligned to 8 bytes on 64bit machines, but that
263      * may not be true on 32bit machines. We need this padding to
264      * make sure the migration can survive even between 32bit and
265      * 64bit machines.
266      */
267     size = ROUND_UP(size, 8);
268 
269     qemu_put_be64(file, size);
270     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
271     /*
272      * Mark the end of the stream, so that corruption in the middle
273      * can still be detected.
274      */
275     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
276     qemu_fflush(file);
277 
278     g_free(le_bitmap);
279 
280     if (qemu_file_get_error(file)) {
281         return qemu_file_get_error(file);
282     }
283 
284     return size + sizeof(size);
285 }
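/*
 * Receive-side sketch (illustrative only, not the actual receiver code):
 * the peer would mirror the format written above, e.g.:
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // the stream is corrupted
 *     }
 *
 * assuming le_bitmap points to a buffer of at least size bytes.
 */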
286 
287 /*
288  * An outstanding page request, on the source, having been received
289  * and queued
290  */
291 struct RAMSrcPageRequest {
292     RAMBlock *rb;
293     hwaddr    offset;
294     hwaddr    len;
295 
296     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 };
298 
299 typedef struct {
300     /*
301      * Cached ramblock/offset values if preempted.  They're only meaningful if
302      * preempted==true below.
303      */
304     RAMBlock *ram_block;
305     unsigned long ram_page;
306     /*
307      * Whether a postcopy preemption just happened.  Will be reset once
308      * precopy has recovered to background migration.
309      */
310     bool preempted;
311 } PostcopyPreemptState;
312 
313 /* State of RAM for migration */
314 struct RAMState {
315     /* QEMUFile used for this migration */
316     QEMUFile *f;
317     /* UFFD file descriptor, used in 'write-tracking' migration */
318     int uffdio_fd;
319     /* Last block that we have visited searching for dirty pages */
320     RAMBlock *last_seen_block;
321     /* Last block from where we have sent data */
322     RAMBlock *last_sent_block;
323     /* Last dirty target page we have sent */
324     ram_addr_t last_page;
325     /* last ram version we have seen */
326     uint32_t last_version;
327     /* How many times we have dirtied too many pages */
328     int dirty_rate_high_cnt;
329     /* these variables are used for bitmap sync */
330     /* last time we did a full bitmap_sync */
331     int64_t time_last_bitmap_sync;
332     /* bytes transferred at start_time */
333     uint64_t bytes_xfer_prev;
334     /* number of dirty pages since start_time */
335     uint64_t num_dirty_pages_period;
336     /* xbzrle misses since the beginning of the period */
337     uint64_t xbzrle_cache_miss_prev;
338     /* Amount of xbzrle pages since the beginning of the period */
339     uint64_t xbzrle_pages_prev;
340     /* Amount of xbzrle encoded bytes since the beginning of the period */
341     uint64_t xbzrle_bytes_prev;
342     /* Start using XBZRLE (e.g., after the first round). */
343     bool xbzrle_enabled;
344     /* Are we on the last stage of migration */
345     bool last_stage;
346     /* compression statistics since the beginning of the period */
347     /* number of times there was no free thread to compress data */
348     uint64_t compress_thread_busy_prev;
349     /* number of bytes after compression */
350     uint64_t compressed_size_prev;
351     /* number of compressed pages */
352     uint64_t compress_pages_prev;
353 
354     /* total handled target pages at the beginning of period */
355     uint64_t target_page_count_prev;
356     /* total handled target pages since start */
357     uint64_t target_page_count;
358     /* number of dirty bits in the bitmap */
359     uint64_t migration_dirty_pages;
360     /* Protects modification of the bitmap and migration dirty pages */
361     QemuMutex bitmap_mutex;
362     /* The RAMBlock used in the last src_page_requests */
363     RAMBlock *last_req_rb;
364     /* Queue of outstanding page requests from the destination */
365     QemuMutex src_page_req_mutex;
366     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
367 
368     /* Postcopy preemption information */
369     PostcopyPreemptState postcopy_preempt_state;
370     /*
371      * Current channel we're using on src VM.  Only valid if postcopy-preempt
372      * is enabled.
373      */
374     unsigned int postcopy_channel;
375 };
376 typedef struct RAMState RAMState;
377 
378 static RAMState *ram_state;
379 
380 static NotifierWithReturnList precopy_notifier_list;
381 
382 static void postcopy_preempt_reset(RAMState *rs)
383 {
384     memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
385 }
386 
387 /* Whether postcopy has queued requests */
388 static bool postcopy_has_request(RAMState *rs)
389 {
390     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
391 }
392 
393 void precopy_infrastructure_init(void)
394 {
395     notifier_with_return_list_init(&precopy_notifier_list);
396 }
397 
398 void precopy_add_notifier(NotifierWithReturn *n)
399 {
400     notifier_with_return_list_add(&precopy_notifier_list, n);
401 }
402 
403 void precopy_remove_notifier(NotifierWithReturn *n)
404 {
405     notifier_with_return_remove(n);
406 }
407 
408 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
409 {
410     PrecopyNotifyData pnd;
411     pnd.reason = reason;
412     pnd.errp = errp;
413 
414     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
415 }
416 
417 uint64_t ram_bytes_remaining(void)
418 {
419     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
420                        0;
421 }
422 
423 MigrationStats ram_counters;
424 
425 static void ram_transferred_add(uint64_t bytes)
426 {
427     if (runstate_is_running()) {
428         ram_counters.precopy_bytes += bytes;
429     } else if (migration_in_postcopy()) {
430         ram_counters.postcopy_bytes += bytes;
431     } else {
432         ram_counters.downtime_bytes += bytes;
433     }
434     ram_counters.transferred += bytes;
435 }
436 
437 void dirty_sync_missed_zero_copy(void)
438 {
439     ram_counters.dirty_sync_missed_zero_copy++;
440 }
441 
442 /* used by the search for pages to send */
443 struct PageSearchStatus {
444     /* Current block being searched */
445     RAMBlock    *block;
446     /* Current page to search from */
447     unsigned long page;
448     /* Set once we wrap around */
449     bool         complete_round;
450     /*
451      * [POSTCOPY-ONLY] Whether current page is explicitly requested by
452      * postcopy.  When set, the request is "urgent" because the dest QEMU
453      * threads are waiting for us.
454      */
455     bool         postcopy_requested;
456     /*
457      * [POSTCOPY-ONLY] The target channel to use to send current page.
458      *
459      * Note: This may _not_ match with the value in postcopy_requested
460      * above. Let's imagine the case where the postcopy request is exactly
461      * the page that we're sending in progress during precopy. In this case
462      * we'll have postcopy_requested set to true but the target channel
463      * will be the precopy channel (so that we don't split brain on that
464      * specific page since the precopy channel already contains partial of
465      * that page data).
466      *
467      * Besides that specific use case, postcopy_target_channel should
468      * always be equal to postcopy_requested, because by default we send
469      * postcopy pages via postcopy preempt channel.
470      */
471     bool         postcopy_target_channel;
472 };
473 typedef struct PageSearchStatus PageSearchStatus;
474 
475 CompressionStats compression_counters;
476 
477 struct CompressParam {
478     bool done;
479     bool quit;
480     bool zero_page;
481     QEMUFile *file;
482     QemuMutex mutex;
483     QemuCond cond;
484     RAMBlock *block;
485     ram_addr_t offset;
486 
487     /* internally used fields */
488     z_stream stream;
489     uint8_t *originbuf;
490 };
491 typedef struct CompressParam CompressParam;
492 
493 struct DecompressParam {
494     bool done;
495     bool quit;
496     QemuMutex mutex;
497     QemuCond cond;
498     void *des;
499     uint8_t *compbuf;
500     int len;
501     z_stream stream;
502 };
503 typedef struct DecompressParam DecompressParam;
504 
505 static CompressParam *comp_param;
506 static QemuThread *compress_threads;
507 /* comp_done_cond is used to wake up the migration thread when
508  * one of the compression threads has finished the compression.
509  * comp_done_lock is used together with comp_done_cond.
510  */
511 static QemuMutex comp_done_lock;
512 static QemuCond comp_done_cond;
513 
514 static QEMUFile *decomp_file;
515 static DecompressParam *decomp_param;
516 static QemuThread *decompress_threads;
517 static QemuMutex decomp_done_lock;
518 static QemuCond decomp_done_cond;
519 
520 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
521                                  ram_addr_t offset, uint8_t *source_buf);
522 
523 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
524                                      bool postcopy_requested);
525 
526 static void *do_data_compress(void *opaque)
527 {
528     CompressParam *param = opaque;
529     RAMBlock *block;
530     ram_addr_t offset;
531     bool zero_page;
532 
533     qemu_mutex_lock(&param->mutex);
534     while (!param->quit) {
535         if (param->block) {
536             block = param->block;
537             offset = param->offset;
538             param->block = NULL;
539             qemu_mutex_unlock(&param->mutex);
540 
541             zero_page = do_compress_ram_page(param->file, &param->stream,
542                                              block, offset, param->originbuf);
543 
544             qemu_mutex_lock(&comp_done_lock);
545             param->done = true;
546             param->zero_page = zero_page;
547             qemu_cond_signal(&comp_done_cond);
548             qemu_mutex_unlock(&comp_done_lock);
549 
550             qemu_mutex_lock(&param->mutex);
551         } else {
552             qemu_cond_wait(&param->cond, &param->mutex);
553         }
554     }
555     qemu_mutex_unlock(&param->mutex);
556 
557     return NULL;
558 }
559 
560 static void compress_threads_save_cleanup(void)
561 {
562     int i, thread_count;
563 
564     if (!migrate_use_compression() || !comp_param) {
565         return;
566     }
567 
568     thread_count = migrate_compress_threads();
569     for (i = 0; i < thread_count; i++) {
570         /*
571          * we use it as an indicator of whether the thread has been
572          * properly initialized
573          */
574         if (!comp_param[i].file) {
575             break;
576         }
577 
578         qemu_mutex_lock(&comp_param[i].mutex);
579         comp_param[i].quit = true;
580         qemu_cond_signal(&comp_param[i].cond);
581         qemu_mutex_unlock(&comp_param[i].mutex);
582 
583         qemu_thread_join(compress_threads + i);
584         qemu_mutex_destroy(&comp_param[i].mutex);
585         qemu_cond_destroy(&comp_param[i].cond);
586         deflateEnd(&comp_param[i].stream);
587         g_free(comp_param[i].originbuf);
588         qemu_fclose(comp_param[i].file);
589         comp_param[i].file = NULL;
590     }
591     qemu_mutex_destroy(&comp_done_lock);
592     qemu_cond_destroy(&comp_done_cond);
593     g_free(compress_threads);
594     g_free(comp_param);
595     compress_threads = NULL;
596     comp_param = NULL;
597 }
598 
599 static int compress_threads_save_setup(void)
600 {
601     int i, thread_count;
602 
603     if (!migrate_use_compression()) {
604         return 0;
605     }
606     thread_count = migrate_compress_threads();
607     compress_threads = g_new0(QemuThread, thread_count);
608     comp_param = g_new0(CompressParam, thread_count);
609     qemu_cond_init(&comp_done_cond);
610     qemu_mutex_init(&comp_done_lock);
611     for (i = 0; i < thread_count; i++) {
612         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
613         if (!comp_param[i].originbuf) {
614             goto exit;
615         }
616 
617         if (deflateInit(&comp_param[i].stream,
618                         migrate_compress_level()) != Z_OK) {
619             g_free(comp_param[i].originbuf);
620             goto exit;
621         }
622 
623         /* comp_param[i].file is just used as a dummy buffer to save data;
624          * back it with a null channel that discards everything.
625          */
626         comp_param[i].file = qemu_file_new_output(
627             QIO_CHANNEL(qio_channel_null_new()));
628         comp_param[i].done = true;
629         comp_param[i].quit = false;
630         qemu_mutex_init(&comp_param[i].mutex);
631         qemu_cond_init(&comp_param[i].cond);
632         qemu_thread_create(compress_threads + i, "compress",
633                            do_data_compress, comp_param + i,
634                            QEMU_THREAD_JOINABLE);
635     }
636     return 0;
637 
638 exit:
639     compress_threads_save_cleanup();
640     return -1;
641 }
642 
643 /**
644  * save_page_header: write page header to wire
645  *
646  * If the block differs from the last block sent, it also writes the block identification
647  *
648  * Returns the number of bytes written
649  *
650  * @f: QEMUFile where to send the data
651  * @block: block that contains the page we want to send
652  * @offset: offset inside the block for the page
653  *          in the lower bits, it contains flags
654  */
655 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
656                                ram_addr_t offset)
657 {
658     size_t size, len;
659 
660     if (block == rs->last_sent_block) {
661         offset |= RAM_SAVE_FLAG_CONTINUE;
662     }
663     qemu_put_be64(f, offset);
664     size = 8;
665 
666     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
667         len = strlen(block->idstr);
668         qemu_put_byte(f, len);
669         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
670         size += 1 + len;
671         rs->last_sent_block = block;
672     }
673     return size;
674 }
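/*
 * Resulting wire layout (informational sketch of what the code above
 * emits):
 *
 *     8 bytes   be64: page offset within the block, ORed with the
 *               RAM_SAVE_FLAG_* bits
 *     1 byte    length of block->idstr  \  only when RAM_SAVE_FLAG_CONTINUE
 *     N bytes   block->idstr            /  is not set
 */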
675 
676 /**
677  * mig_throttle_guest_down: throttle down the guest
678  *
679  * Reduce amount of guest cpu execution to hopefully slow down memory
680  * writes. If guest dirty memory rate is reduced below the rate at
681  * which we can transfer pages to the destination then we should be
682  * able to complete migration. Some workloads dirty memory way too
683  * fast and will not effectively converge, even with auto-converge.
684  */
685 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
686                                     uint64_t bytes_dirty_threshold)
687 {
688     MigrationState *s = migrate_get_current();
689     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
690     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
691     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
692     int pct_max = s->parameters.max_cpu_throttle;
693 
694     uint64_t throttle_now = cpu_throttle_get_percentage();
695     uint64_t cpu_now, cpu_ideal, throttle_inc;
696 
697     /* We have not started throttling yet. Let's start it. */
698     if (!cpu_throttle_active()) {
699         cpu_throttle_set(pct_initial);
700     } else {
701         /* Throttling already on, just increase the rate */
702         if (!pct_tailslow) {
703             throttle_inc = pct_increment;
704         } else {
705             /* Compute the ideal CPU percentage used by the guest, which
706              * should make the dirty rate match the dirty rate threshold. */
707             cpu_now = 100 - throttle_now;
708             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
709                         bytes_dirty_period);
710             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
711         }
712         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
713     }
714 }
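/*
 * Worked example for the tailslow path above (illustrative): with the
 * guest currently throttled at 40% and the dirty rate at twice the
 * threshold, cpu_now is 60 and cpu_ideal is 60 * 0.5 == 30, so the
 * increment is MIN(60 - 30, pct_increment).
 */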
715 
716 void mig_throttle_counter_reset(void)
717 {
718     RAMState *rs = ram_state;
719 
720     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
721     rs->num_dirty_pages_period = 0;
722     rs->bytes_xfer_prev = ram_counters.transferred;
723 }
724 
725 /**
726  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
727  *
728  * @rs: current RAM state
729  * @current_addr: address for the zero page
730  *
731  * Update the xbzrle cache to reflect a page that's been sent as all 0.
732  * The important thing is that a stale (not-yet-0'd) page be replaced
733  * by the new data.
734  * As a bonus, if the page wasn't in the cache it gets added so that
735  * when a small write is made into the 0'd page it gets XBZRLE sent.
736  */
737 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
738 {
739     if (!rs->xbzrle_enabled) {
740         return;
741     }
742 
743     /* We don't care if this fails to allocate a new cache page
744      * as long as it updates an old one */
745     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
746                  ram_counters.dirty_sync_count);
747 }
748 
749 #define ENCODING_FLAG_XBZRLE 0x1
750 
751 /**
752  * save_xbzrle_page: compress and send current page
753  *
754  * Returns: 1 means that we wrote the page
755  *          0 means that page is identical to the one already sent
756  *          -1 means that xbzrle would be longer than normal
757  *
758  * @rs: current RAM state
759  * @current_data: pointer to the address of the page contents
760  * @current_addr: addr of the page
761  * @block: block that contains the page we want to send
762  * @offset: offset inside the block for the page
763  */
764 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
765                             ram_addr_t current_addr, RAMBlock *block,
766                             ram_addr_t offset)
767 {
768     int encoded_len = 0, bytes_xbzrle;
769     uint8_t *prev_cached_page;
770 
771     if (!cache_is_cached(XBZRLE.cache, current_addr,
772                          ram_counters.dirty_sync_count)) {
773         xbzrle_counters.cache_miss++;
774         if (!rs->last_stage) {
775             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
776                              ram_counters.dirty_sync_count) == -1) {
777                 return -1;
778             } else {
779                 /* update *current_data when the page has been
780                    inserted into cache */
781                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
782             }
783         }
784         return -1;
785     }
786 
787     /*
788      * Reaching here means the page has hit the xbzrle cache, no matter what
789      * encoding result it is (normal encoding, overflow or skipping the page),
790      * count the page as encoded. This is used to calculate the encoding rate.
791      *
792      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
793      * 2nd page turns out to be skipped (i.e. no new bytes written to the
794      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
795      * skipped page included. In this way, the encoding rate can tell if the
796      * guest page is good for xbzrle encoding.
797      */
798     xbzrle_counters.pages++;
799     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
800 
801     /* save current buffer into memory */
802     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
803 
804     /* XBZRLE encoding (if there is no overflow) */
805     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
806                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
807                                        TARGET_PAGE_SIZE);
808 
809     /*
810      * Update the cache contents, so that it corresponds to the data
811      * sent, in all cases except where we skip the page.
812      */
813     if (!rs->last_stage && encoded_len != 0) {
814         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
815         /*
816          * In the case where we couldn't compress, ensure that the caller
817          * sends the data from the cache, since the guest might have
818          * changed the RAM since we copied it.
819          */
820         *current_data = prev_cached_page;
821     }
822 
823     if (encoded_len == 0) {
824         trace_save_xbzrle_page_skipping();
825         return 0;
826     } else if (encoded_len == -1) {
827         trace_save_xbzrle_page_overflow();
828         xbzrle_counters.overflow++;
829         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
830         return -1;
831     }
832 
833     /* Send XBZRLE based compressed page */
834     bytes_xbzrle = save_page_header(rs, rs->f, block,
835                                     offset | RAM_SAVE_FLAG_XBZRLE);
836     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
837     qemu_put_be16(rs->f, encoded_len);
838     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
839     bytes_xbzrle += encoded_len + 1 + 2;
840     /*
841      * Like compressed_size (please see update_compress_thread_counts),
842      * the xbzrle encoded bytes don't count the 8 byte header with
843      * RAM_SAVE_FLAG_CONTINUE.
844      */
845     xbzrle_counters.bytes += bytes_xbzrle - 8;
846     ram_transferred_add(bytes_xbzrle);
847 
848     return 1;
849 }
850 
851 /**
852  * migration_bitmap_find_dirty: find the next dirty page from start
853  *
854  * Returns the page offset within memory region of the start of a dirty page
855  *
856  * @rs: current RAM state
857  * @rb: RAMBlock where to search for dirty pages
858  * @start: page where we start the search
859  */
860 static inline
861 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
862                                           unsigned long start)
863 {
864     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
865     unsigned long *bitmap = rb->bmap;
866 
867     if (ramblock_is_ignored(rb)) {
868         return size;
869     }
870 
871     return find_next_bit(bitmap, size, start);
872 }
873 
874 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
875                                                        unsigned long page)
876 {
877     uint8_t shift;
878     hwaddr size, start;
879 
880     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
881         return;
882     }
883 
884     shift = rb->clear_bmap_shift;
885     /*
886      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It can
887      * make things easier, since the start address of each small chunk
888      * will then always be aligned to 64 pages, so the bitmap will
889      * always be aligned to unsigned long.  We should even be able to
890      * remove this restriction, but it is simply kept here for
891      * simplicity.
892      */
893     assert(shift >= 6);
894 
895     size = 1ULL << (TARGET_PAGE_BITS + shift);
896     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
897     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
898     memory_region_clear_dirty_bitmap(rb->mr, start, size);
899 }
900 
901 static void
902 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
903                                                  unsigned long start,
904                                                  unsigned long npages)
905 {
906     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
907     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
908     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
909 
910     /*
911      * Clear pages from start to start + npages - 1, so the end boundary is
912      * exclusive.
913      */
914     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
915         migration_clear_memory_region_dirty_bitmap(rb, i);
916     }
917 }
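/*
 * Worked example (illustrative): with clear_bmap_shift == 6, chunk_pages
 * is 64.  Clearing pages 100..299 gives chunk_start == 64 and
 * chunk_end == 320, so the chunks starting at pages 64, 128, 192 and 256
 * are cleared.
 */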
918 
919 /*
920  * colo_bitmap_find_dirty: find contiguous dirty pages from start
921  *
922  * Returns the page offset within memory region of the start of the
923  * contiguous dirty page range
924  *
925  * @rs: current RAM state
926  * @rb: RAMBlock where to search for dirty pages
927  * @start: page where we start the search
928  * @num: the number of contiguous dirty pages
929  */
930 static inline
931 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
932                                      unsigned long start, unsigned long *num)
933 {
934     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
935     unsigned long *bitmap = rb->bmap;
936     unsigned long first, next;
937 
938     *num = 0;
939 
940     if (ramblock_is_ignored(rb)) {
941         return size;
942     }
943 
944     first = find_next_bit(bitmap, size, start);
945     if (first >= size) {
946         return first;
947     }
948     next = find_next_zero_bit(bitmap, size, first + 1);
949     assert(next >= first);
950     *num = next - first;
951     return first;
952 }
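/*
 * Worked example (illustrative): with a bitmap whose bits 2..5 are set
 * and start == 0, find_next_bit() returns 2 and find_next_zero_bit()
 * returns 6, so the function returns 2 with *num == 4.
 */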
953 
954 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
955                                                 RAMBlock *rb,
956                                                 unsigned long page)
957 {
958     bool ret;
959 
960     /*
961      * Clear the dirty bitmap if needed.  This _must_ be called before we
962      * send any page in the chunk, because we need to make sure we can
963      * capture further page content changes when we sync the dirty log
964      * the next time.  So as long as we are going to send any page in
965      * the chunk, we clear the remote dirty bitmap for all of it.
966      * Clearing it earlier is not a problem, but clearing it too late is.
967      */
968     migration_clear_memory_region_dirty_bitmap(rb, page);
969 
970     ret = test_and_clear_bit(page, rb->bmap);
971     if (ret) {
972         rs->migration_dirty_pages--;
973     }
974 
975     return ret;
976 }
977 
978 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
979                                        void *opaque)
980 {
981     const hwaddr offset = section->offset_within_region;
982     const hwaddr size = int128_get64(section->size);
983     const unsigned long start = offset >> TARGET_PAGE_BITS;
984     const unsigned long npages = size >> TARGET_PAGE_BITS;
985     RAMBlock *rb = section->mr->ram_block;
986     uint64_t *cleared_bits = opaque;
987 
988     /*
989      * We don't grab ram_state->bitmap_mutex because we expect to run
990      * only when starting migration or during postcopy recovery where
991      * we don't have concurrent access.
992      */
993     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
994         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
995     }
996     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
997     bitmap_clear(rb->bmap, start, npages);
998 }
999 
1000 /*
1001  * Exclude all dirty pages from migration that fall into a discarded range as
1002  * managed by a RamDiscardManager responsible for the mapped memory region of
1003  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1004  *
1005  * Discarded pages ("logically unplugged") have undefined content and must
1006  * not get migrated, because even reading these pages for migration might
1007  * result in undesired behavior.
1008  *
1009  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1010  *
1011  * Note: The result is only stable while migrating (precopy/postcopy).
1012  */
1013 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1014 {
1015     uint64_t cleared_bits = 0;
1016 
1017     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1018         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1019         MemoryRegionSection section = {
1020             .mr = rb->mr,
1021             .offset_within_region = 0,
1022             .size = int128_make64(qemu_ram_get_used_length(rb)),
1023         };
1024 
1025         ram_discard_manager_replay_discarded(rdm, &section,
1026                                              dirty_bitmap_clear_section,
1027                                              &cleared_bits);
1028     }
1029     return cleared_bits;
1030 }
1031 
1032 /*
1033  * Check if a host-page aligned page falls into a discarded range as managed by
1034  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1035  *
1036  * Note: The result is only stable while migrating (precopy/postcopy).
1037  */
1038 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1039 {
1040     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1041         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1042         MemoryRegionSection section = {
1043             .mr = rb->mr,
1044             .offset_within_region = start,
1045             .size = int128_make64(qemu_ram_pagesize(rb)),
1046         };
1047 
1048         return !ram_discard_manager_is_populated(rdm, &section);
1049     }
1050     return false;
1051 }
1052 
1053 /* Called with RCU critical section */
1054 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1055 {
1056     uint64_t new_dirty_pages =
1057         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1058 
1059     rs->migration_dirty_pages += new_dirty_pages;
1060     rs->num_dirty_pages_period += new_dirty_pages;
1061 }
1062 
1063 /**
1064  * ram_pagesize_summary: calculate all the pagesizes of a VM
1065  *
1066  * Returns a summary bitmap of the page sizes of all RAMBlocks
1067  *
1068  * For VMs with just normal pages this is equivalent to the host page
1069  * size. If it has some huge pages then it's the OR of all the
1070  * different page sizes.
1071  */
1072 uint64_t ram_pagesize_summary(void)
1073 {
1074     RAMBlock *block;
1075     uint64_t summary = 0;
1076 
1077     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1078         summary |= block->page_size;
1079     }
1080 
1081     return summary;
1082 }
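/*
 * Worked example (illustrative): a VM backed by 4 KiB pages plus one
 * block backed by 2 MiB hugepages yields 0x1000 | 0x200000 == 0x201000.
 */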
1083 
1084 uint64_t ram_get_total_transferred_pages(void)
1085 {
1086     return  ram_counters.normal + ram_counters.duplicate +
1087                 compression_counters.pages + xbzrle_counters.pages;
1088 }
1089 
1090 static void migration_update_rates(RAMState *rs, int64_t end_time)
1091 {
1092     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1093     double compressed_size;
1094 
1095     /* calculate period counters */
1096     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1097                 / (end_time - rs->time_last_bitmap_sync);
1098 
1099     if (!page_count) {
1100         return;
1101     }
1102 
1103     if (migrate_use_xbzrle()) {
1104         double encoded_size, unencoded_size;
1105 
1106         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1107             rs->xbzrle_cache_miss_prev) / page_count;
1108         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1109         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1110                          TARGET_PAGE_SIZE;
1111         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1112         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1113             xbzrle_counters.encoding_rate = 0;
1114         } else {
1115             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1116         }
1117         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1118         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1119     }
1120 
1121     if (migrate_use_compression()) {
1122         compression_counters.busy_rate = (double)(compression_counters.busy -
1123             rs->compress_thread_busy_prev) / page_count;
1124         rs->compress_thread_busy_prev = compression_counters.busy;
1125 
1126         compressed_size = compression_counters.compressed_size -
1127                           rs->compressed_size_prev;
1128         if (compressed_size) {
1129             double uncompressed_size = (compression_counters.pages -
1130                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1131 
1132             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1133             compression_counters.compression_rate =
1134                                         uncompressed_size / compressed_size;
1135 
1136             rs->compress_pages_prev = compression_counters.pages;
1137             rs->compressed_size_prev = compression_counters.compressed_size;
1138         }
1139     }
1140 }
1141 
1142 static void migration_trigger_throttle(RAMState *rs)
1143 {
1144     MigrationState *s = migrate_get_current();
1145     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1146 
1147     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1148     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1149     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1150 
1151     /* During block migration the auto-converge logic incorrectly detects
1152      * that ram migration makes no progress. Avoid this by disabling the
1153      * throttling logic during the bulk phase of block migration. */
1154     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1155         /* The following detection logic can be refined later. For now:
1156            Check to see if the ratio between dirtied bytes and the approx.
1157            amount of bytes that just got transferred since the last time
1158            we were in this routine reaches the threshold. If that happens
1159            twice, start or increase throttling. */
1160 
1161         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1162             (++rs->dirty_rate_high_cnt >= 2)) {
1163             trace_migration_throttle();
1164             rs->dirty_rate_high_cnt = 0;
1165             mig_throttle_guest_down(bytes_dirty_period,
1166                                     bytes_dirty_threshold);
1167         }
1168     }
1169 }
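/*
 * Worked example (illustrative): with throttle_trigger_threshold == 50
 * and 100 MB transferred in the period, bytes_dirty_threshold is 50 MB.
 * If more than 50 MB are dirtied in two consecutive periods, throttling
 * starts or is increased.
 */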
1170 
1171 static void migration_bitmap_sync(RAMState *rs)
1172 {
1173     RAMBlock *block;
1174     int64_t end_time;
1175 
1176     ram_counters.dirty_sync_count++;
1177 
1178     if (!rs->time_last_bitmap_sync) {
1179         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1180     }
1181 
1182     trace_migration_bitmap_sync_start();
1183     memory_global_dirty_log_sync();
1184 
1185     qemu_mutex_lock(&rs->bitmap_mutex);
1186     WITH_RCU_READ_LOCK_GUARD() {
1187         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1188             ramblock_sync_dirty_bitmap(rs, block);
1189         }
1190         ram_counters.remaining = ram_bytes_remaining();
1191     }
1192     qemu_mutex_unlock(&rs->bitmap_mutex);
1193 
1194     memory_global_after_dirty_log_sync();
1195     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1196 
1197     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1198 
1199     /* more than 1 second = 1000 milliseconds */
1200     if (end_time > rs->time_last_bitmap_sync + 1000) {
1201         migration_trigger_throttle(rs);
1202 
1203         migration_update_rates(rs, end_time);
1204 
1205         rs->target_page_count_prev = rs->target_page_count;
1206 
1207         /* reset period counters */
1208         rs->time_last_bitmap_sync = end_time;
1209         rs->num_dirty_pages_period = 0;
1210         rs->bytes_xfer_prev = ram_counters.transferred;
1211     }
1212     if (migrate_use_events()) {
1213         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1214     }
1215 }
1216 
1217 static void migration_bitmap_sync_precopy(RAMState *rs)
1218 {
1219     Error *local_err = NULL;
1220 
1221     /*
1222      * The current notifier usage is just an optimization for migration, so we
1223      * don't stop the normal migration process in the error case.
1224      */
1225     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1226         error_report_err(local_err);
1227         local_err = NULL;
1228     }
1229 
1230     migration_bitmap_sync(rs);
1231 
1232     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1233         error_report_err(local_err);
1234     }
1235 }
1236 
1237 static void ram_release_page(const char *rbname, uint64_t offset)
1238 {
1239     if (!migrate_release_ram() || !migration_in_postcopy()) {
1240         return;
1241     }
1242 
1243     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1244 }
1245 
1246 /**
1247  * save_zero_page_to_file: send the zero page to the file
1248  *
1249  * Returns the size of data written to the file, 0 means the page is not
1250  * a zero page
1251  *
1252  * @rs: current RAM state
1253  * @file: the file where the data is saved
1254  * @block: block that contains the page we want to send
1255  * @offset: offset inside the block for the page
1256  */
1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1258                                   RAMBlock *block, ram_addr_t offset)
1259 {
1260     uint8_t *p = block->host + offset;
1261     int len = 0;
1262 
1263     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1264         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1265         qemu_put_byte(file, 0);
1266         len += 1;
1267         ram_release_page(block->idstr, offset);
1268     }
1269     return len;
1270 }
1271 
1272 /**
1273  * save_zero_page: send the zero page to the stream
1274  *
1275  * Returns the number of pages written.
1276  *
1277  * @rs: current RAM state
1278  * @block: block that contains the page we want to send
1279  * @offset: offset inside the block for the page
1280  */
1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1282 {
1283     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1284 
1285     if (len) {
1286         ram_counters.duplicate++;
1287         ram_transferred_add(len);
1288         return 1;
1289     }
1290     return -1;
1291 }
1292 
1293 /*
1294  * @pages: the number of pages written by the control path,
1295  *        < 0 - error
1296  *        > 0 - number of pages written
1297  *
1298  * Return true if the page has been saved, otherwise false.
1299  */
1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301                               int *pages)
1302 {
1303     uint64_t bytes_xmit = 0;
1304     int ret;
1305 
1306     *pages = -1;
1307     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1308                                 &bytes_xmit);
1309     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1310         return false;
1311     }
1312 
1313     if (bytes_xmit) {
1314         ram_transferred_add(bytes_xmit);
1315         *pages = 1;
1316     }
1317 
1318     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1319         return true;
1320     }
1321 
1322     if (bytes_xmit > 0) {
1323         ram_counters.normal++;
1324     } else if (bytes_xmit == 0) {
1325         ram_counters.duplicate++;
1326     }
1327 
1328     return true;
1329 }
1330 
1331 /*
1332  * directly send the page to the stream
1333  *
1334  * Returns the number of pages written.
1335  *
1336  * @rs: current RAM state
1337  * @block: block that contains the page we want to send
1338  * @offset: offset inside the block for the page
1339  * @buf: the page to be sent
1340  * @async: send the page asynchronously
1341  */
1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1343                             uint8_t *buf, bool async)
1344 {
1345     ram_transferred_add(save_page_header(rs, rs->f, block,
1346                                          offset | RAM_SAVE_FLAG_PAGE));
1347     if (async) {
1348         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1349                               migrate_release_ram() &&
1350                               migration_in_postcopy());
1351     } else {
1352         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1353     }
1354     ram_transferred_add(TARGET_PAGE_SIZE);
1355     ram_counters.normal++;
1356     return 1;
1357 }
1358 
1359 /**
1360  * ram_save_page: send the given page to the stream
1361  *
1362  * Returns the number of pages written.
1363  *          < 0 - error
1364  *          >=0 - Number of pages written - this might legally be 0
1365  *                if xbzrle noticed the page was the same.
1366  *
1367  * @rs: current RAM state
1368  * @block: block that contains the page we want to send
1369  * @offset: offset inside the block for the page
1370  */
1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1372 {
1373     int pages = -1;
1374     uint8_t *p;
1375     bool send_async = true;
1376     RAMBlock *block = pss->block;
1377     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1378     ram_addr_t current_addr = block->offset + offset;
1379 
1380     p = block->host + offset;
1381     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1382 
1383     XBZRLE_cache_lock();
1384     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1385         pages = save_xbzrle_page(rs, &p, current_addr, block,
1386                                  offset);
1387         if (!rs->last_stage) {
1388             /* Can't send this cached data async, since the cache page
1389              * might get updated before it gets to the wire
1390              */
1391             send_async = false;
1392         }
1393     }
1394 
1395     /* XBZRLE overflow or normal page */
1396     if (pages == -1) {
1397         pages = save_normal_page(rs, block, offset, p, send_async);
1398     }
1399 
1400     XBZRLE_cache_unlock();
1401 
1402     return pages;
1403 }
1404 
1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1406                                  ram_addr_t offset)
1407 {
1408     if (multifd_queue_page(rs->f, block, offset) < 0) {
1409         return -1;
1410     }
1411     ram_counters.normal++;
1412 
1413     return 1;
1414 }
1415 
1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1417                                  ram_addr_t offset, uint8_t *source_buf)
1418 {
1419     RAMState *rs = ram_state;
1420     uint8_t *p = block->host + offset;
1421     int ret;
1422 
1423     if (save_zero_page_to_file(rs, f, block, offset)) {
1424         return true;
1425     }
1426 
1427     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1428 
1429     /*
1430      * copy it to an internal buffer so that it cannot be modified by the
1431      * VM, which lets us catch any error during compression and
1432      * decompression
1433      */
1434     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1435     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1436     if (ret < 0) {
1437         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1438         error_report("compressed data failed!");
1439     }
1440     return false;
1441 }
1442 
1443 static void
1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1445 {
1446     ram_transferred_add(bytes_xmit);
1447 
1448     if (param->zero_page) {
1449         ram_counters.duplicate++;
1450         return;
1451     }
1452 
1453     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1454     compression_counters.compressed_size += bytes_xmit - 8;
1455     compression_counters.pages++;
1456 }
1457 
1458 static bool save_page_use_compression(RAMState *rs);
1459 
1460 static void flush_compressed_data(RAMState *rs)
1461 {
1462     int idx, len, thread_count;
1463 
1464     if (!save_page_use_compression(rs)) {
1465         return;
1466     }
1467     thread_count = migrate_compress_threads();
1468 
1469     qemu_mutex_lock(&comp_done_lock);
1470     for (idx = 0; idx < thread_count; idx++) {
1471         while (!comp_param[idx].done) {
1472             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473         }
1474     }
1475     qemu_mutex_unlock(&comp_done_lock);
1476 
1477     for (idx = 0; idx < thread_count; idx++) {
1478         qemu_mutex_lock(&comp_param[idx].mutex);
1479         if (!comp_param[idx].quit) {
1480             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1481             /*
1482              * it's safe to fetch zero_page without holding comp_done_lock
1483              * as there is no further request submitted to the thread,
1484              * i.e., the thread should be waiting for a request at this point.
1485              */
1486             update_compress_thread_counts(&comp_param[idx], len);
1487         }
1488         qemu_mutex_unlock(&comp_param[idx].mutex);
1489     }
1490 }
1491 
1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1493                                        ram_addr_t offset)
1494 {
1495     param->block = block;
1496     param->offset = offset;
1497 }
1498 
1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1500                                            ram_addr_t offset)
1501 {
1502     int idx, thread_count, bytes_xmit = -1, pages = -1;
1503     bool wait = migrate_compress_wait_thread();
1504 
1505     thread_count = migrate_compress_threads();
1506     qemu_mutex_lock(&comp_done_lock);
1507 retry:
1508     for (idx = 0; idx < thread_count; idx++) {
1509         if (comp_param[idx].done) {
1510             comp_param[idx].done = false;
1511             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1512             qemu_mutex_lock(&comp_param[idx].mutex);
1513             set_compress_params(&comp_param[idx], block, offset);
1514             qemu_cond_signal(&comp_param[idx].cond);
1515             qemu_mutex_unlock(&comp_param[idx].mutex);
1516             pages = 1;
1517             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1518             break;
1519         }
1520     }
1521 
1522     /*
1523      * wait for the free thread if the user specifies 'compress-wait-thread',
1524      * otherwise we will send the page out in the main thread as a normal page.
1525      */
1526     if (pages < 0 && wait) {
1527         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1528         goto retry;
1529     }
1530     qemu_mutex_unlock(&comp_done_lock);
1531 
1532     return pages;
1533 }
1534 
1535 /**
1536  * find_dirty_block: find the next dirty page and update any state
1537  * associated with the search process.
1538  *
1539  * Returns true if a page is found
1540  *
1541  * @rs: current RAM state
1542  * @pss: data about the state of the current dirty page scan
1543  * @again: set to false if the search has scanned the whole of RAM
1544  */
1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1546 {
1547     /*
1548      * This is not a postcopy requested page, mark it "not urgent", and use
1549      * the precopy channel to send it.
1550      */
1551     pss->postcopy_requested = false;
1552     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1553 
1554     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1555     if (pss->complete_round && pss->block == rs->last_seen_block &&
1556         pss->page >= rs->last_page) {
1557         /*
1558          * We've been once around the RAM and haven't found anything.
1559          * Give up.
1560          */
1561         *again = false;
1562         return false;
1563     }
1564     if (!offset_in_ramblock(pss->block,
1565                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1566         /* Didn't find anything in this RAM Block */
1567         pss->page = 0;
1568         pss->block = QLIST_NEXT_RCU(pss->block, next);
1569         if (!pss->block) {
1570             /*
1571              * If memory migration starts over, we will meet a dirtied page
1572              * which may still exist in a compression thread's ring, so we
1573              * should flush the compressed data to make sure the new page
1574              * is not overwritten by the old one in the destination.
1575              *
1576              * Also, if xbzrle is on, stop using data compression at this
1577              * point. In theory, xbzrle can do better than compression.
1578              */
1579             flush_compressed_data(rs);
1580 
1581             /* Hit the end of the list */
1582             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1583             /* Flag that we've looped */
1584             pss->complete_round = true;
1585             /* After the first round, enable XBZRLE. */
1586             if (migrate_use_xbzrle()) {
1587                 rs->xbzrle_enabled = true;
1588             }
1589         }
1590         /* Didn't find anything this time, but try again on the new block */
1591         *again = true;
1592         return false;
1593     } else {
1594         /* Can go around again, but... */
1595         *again = true;
1596         /* We've found something so probably don't need to */
1597         return true;
1598     }
1599 }
1600 
1601 /**
1602  * unqueue_page: gets a page off the queue
1603  *
1604  * Helper for 'get_queued_page' - gets a page off the queue
1605  *
1606  * Returns the block of the page (or NULL if none available)
1607  *
1608  * @rs: current RAM state
1609  * @offset: used to return the offset within the RAMBlock
1610  */
1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1612 {
1613     struct RAMSrcPageRequest *entry;
1614     RAMBlock *block = NULL;
1615     size_t page_size;
1616 
1617     if (!postcopy_has_request(rs)) {
1618         return NULL;
1619     }
1620 
1621     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1622 
1623     /*
1624      * This should _never_ change even after we take the lock, because no one
1625      * should be taking anything off the request list other than us.
1626      */
1627     assert(postcopy_has_request(rs));
1628 
1629     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1630     block = entry->rb;
1631     *offset = entry->offset;
1632     page_size = qemu_ram_pagesize(block);
1633     /* Each page request should be a multiple of the ramblock's page size */
1634     assert((entry->len % page_size) == 0);
1635 
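    /* Consume one host page from the request; free the entry once drained */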
1636     if (entry->len > page_size) {
1637         entry->len -= page_size;
1638         entry->offset += page_size;
1639     } else {
1640         memory_region_unref(block->mr);
1641         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1642         g_free(entry);
1643         migration_consume_urgent_request();
1644     }
1645 
1646     trace_unqueue_page(block->idstr, *offset,
1647                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1648 
1649     return block;
1650 }
1651 
1652 #if defined(__linux__)
1653 /**
1654  * poll_fault_page: try to get the next UFFD write fault page and, if a pending
1655  *   fault is found, return the RAM block pointer and page offset
1656  *
1657  * Returns pointer to the RAMBlock containing faulting page,
1658  *   NULL if no write faults are pending
1659  *
1660  * @rs: current RAM state
1661  * @offset: page offset from the beginning of the block
1662  */
1663 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1664 {
1665     struct uffd_msg uffd_msg;
1666     void *page_address;
1667     RAMBlock *block;
1668     int res;
1669 
1670     if (!migrate_background_snapshot()) {
1671         return NULL;
1672     }
1673 
1674     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1675     if (res <= 0) {
1676         return NULL;
1677     }
1678 
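    /* Translate the faulting host address back to its RAMBlock and offset */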
1679     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1680     block = qemu_ram_block_from_host(page_address, false, offset);
1681     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1682     return block;
1683 }
1684 
1685 /**
1686  * ram_save_release_protection: release UFFD write protection after
1687  *   a range of pages has been saved
1688  *
1689  * @rs: current RAM state
1690  * @pss: page-search-status structure
1691  * @start_page: index of the first page in the range relative to pss->block
1692  *
1693  * Returns 0 on success, negative value in case of an error
1694  */
1695 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1696         unsigned long start_page)
1697 {
1698     int res = 0;
1699 
1700     /* Check if page is from UFFD-managed region. */
1701     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1702         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1703         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1704 
1705         /* Flush async buffers before un-protect. */
1706         qemu_fflush(rs->f);
1707         /* Un-protect memory range. */
1708         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1709                 false, false);
1710     }
1711 
1712     return res;
1713 }
1714 
1715 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1716  *
1717  * Returns true if supported, false otherwise
1718  */
1719 bool ram_write_tracking_available(void)
1720 {
1721     uint64_t uffd_features;
1722     int res;
1723 
1724     res = uffd_query_features(&uffd_features);
1725     return (res == 0 &&
1726             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1727 }
1728 
1729 /* ram_write_tracking_compatible: check if guest configuration is
1730  *   compatible with 'write-tracking'
1731  *
1732  * Returns true if compatible, false otherwise
1733  */
1734 bool ram_write_tracking_compatible(void)
1735 {
1736     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1737     int uffd_fd;
1738     RAMBlock *block;
1739     bool ret = false;
1740 
1741     /* Open UFFD file descriptor */
1742     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1743     if (uffd_fd < 0) {
1744         return false;
1745     }
1746 
1747     RCU_READ_LOCK_GUARD();
1748 
1749     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1750         uint64_t uffd_ioctls;
1751 
1752         /* Nothing to do with read-only and MMIO-writable regions */
1753         if (block->mr->readonly || block->mr->rom_device) {
1754             continue;
1755         }
1756         /* Try to register block memory via UFFD-IO to track writes */
1757         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1758                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1759             goto out;
1760         }
1761         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1762             goto out;
1763         }
1764     }
1765     ret = true;
1766 
1767 out:
1768     uffd_close_fd(uffd_fd);
1769     return ret;
1770 }
1771 
1772 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1773                                        ram_addr_t size)
1774 {
1775     /*
1776      * We read one byte of each page; this will preallocate page tables if
1777      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1778      * where no page was populated yet. This might require adaptation when
1779      * supporting other mappings, like shmem.
1780      */
1781     for (ram_addr_t end = offset + size; offset < end; offset += block->page_size) {
1782         char tmp = *((char *)block->host + offset);
1783 
1784         /* Don't optimize the read out */
1785         asm volatile("" : "+r" (tmp));
1786     }
1787 }
1788 
1789 static inline int populate_read_section(MemoryRegionSection *section,
1790                                         void *opaque)
1791 {
1792     const hwaddr size = int128_get64(section->size);
1793     hwaddr offset = section->offset_within_region;
1794     RAMBlock *block = section->mr->ram_block;
1795 
1796     populate_read_range(block, offset, size);
1797     return 0;
1798 }
1799 
1800 /*
1801  * ram_block_populate_read: preallocate page tables and populate pages in the
1802  *   RAM block by reading a byte of each page.
1803  *
1804  * Since it's solely used for the userfault_fd WP feature, here we just
1805  *   hardcode the page size to qemu_real_host_page_size.
1806  *
1807  * @rb: RAM block to populate
1808  */
1809 static void ram_block_populate_read(RAMBlock *rb)
1810 {
1811     /*
1812      * Skip populating all pages that fall into a discarded range as managed by
1813      * a RamDiscardManager responsible for the mapped memory region of the
1814      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1815      * must not get populated automatically. We don't have to track
1816      * modifications via userfaultfd WP reliably, because these pages will
1817      * not be part of the migration stream either way -- see
1818      * ramblock_dirty_bitmap_exclude_discarded_pages().
1819      *
1820      * Note: The result is only stable while migrating (precopy/postcopy).
1821      */
1822     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1823         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1824         MemoryRegionSection section = {
1825             .mr = rb->mr,
1826             .offset_within_region = 0,
1827             .size = rb->mr->size,
1828         };
1829 
1830         ram_discard_manager_replay_populated(rdm, &section,
1831                                              populate_read_section, NULL);
1832     } else {
1833         populate_read_range(rb, 0, rb->used_length);
1834     }
1835 }
1836 
1837 /*
1838  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1839  */
1840 void ram_write_tracking_prepare(void)
1841 {
1842     RAMBlock *block;
1843 
1844     RCU_READ_LOCK_GUARD();
1845 
1846     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1847         /* Nothing to do with read-only and MMIO-writable regions */
1848         if (block->mr->readonly || block->mr->rom_device) {
1849             continue;
1850         }
1851 
1852         /*
1853          * Populate pages of the RAM block before enabling userfault_fd
1854          * write protection.
1855          *
1856          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1857          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1858          * pages with pte_none() entries in page table.
1859          */
1860         ram_block_populate_read(block);
1861     }
1862 }
1863 
1864 /*
1865  * ram_write_tracking_start: start UFFD-WP memory tracking
1866  *
1867  * Returns 0 for success or negative value in case of error
1868  */
1869 int ram_write_tracking_start(void)
1870 {
1871     int uffd_fd;
1872     RAMState *rs = ram_state;
1873     RAMBlock *block;
1874 
1875     /* Open UFFD file descriptor */
1876     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1877     if (uffd_fd < 0) {
1878         return uffd_fd;
1879     }
1880     rs->uffdio_fd = uffd_fd;
1881 
1882     RCU_READ_LOCK_GUARD();
1883 
1884     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1885         /* Nothing to do with read-only and MMIO-writable regions */
1886         if (block->mr->readonly || block->mr->rom_device) {
1887             continue;
1888         }
1889 
1890         /* Register block memory with UFFD to track writes */
1891         if (uffd_register_memory(rs->uffdio_fd, block->host,
1892                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1893             goto fail;
1894         }
1895         /* Apply UFFD write protection to the block memory range */
1896         if (uffd_change_protection(rs->uffdio_fd, block->host,
1897                 block->max_length, true, false)) {
1898             goto fail;
1899         }
1900         block->flags |= RAM_UF_WRITEPROTECT;
1901         memory_region_ref(block->mr);
1902 
1903         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1904                 block->host, block->max_length);
1905     }
1906 
1907     return 0;
1908 
1909 fail:
1910     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1911 
1912     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1913         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1914             continue;
1915         }
1916         /*
1917          * In case some memory block failed to be write-protected,
1918          * remove protection and unregister all RAM blocks that succeeded
1919          */
1920         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1921                 false, false);
1922         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1923         /* Cleanup flags and remove reference */
1924         block->flags &= ~RAM_UF_WRITEPROTECT;
1925         memory_region_unref(block->mr);
1926     }
1927 
1928     uffd_close_fd(uffd_fd);
1929     rs->uffdio_fd = -1;
1930     return -1;
1931 }
1932 
1933 /**
1934  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1935  */
1936 void ram_write_tracking_stop(void)
1937 {
1938     RAMState *rs = ram_state;
1939     RAMBlock *block;
1940 
1941     RCU_READ_LOCK_GUARD();
1942 
1943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1944         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1945             continue;
1946         }
1947         /* Remove protection and unregister all affected RAM blocks */
1948         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1949                 false, false);
1950         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1951 
1952         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1953                 block->host, block->max_length);
1954 
1955         /* Cleanup flags and remove reference */
1956         block->flags &= ~RAM_UF_WRITEPROTECT;
1957         memory_region_unref(block->mr);
1958     }
1959 
1960     /* Finally close UFFD file descriptor */
1961     uffd_close_fd(rs->uffdio_fd);
1962     rs->uffdio_fd = -1;
1963 }
1964 
1965 #else
1966 /* No target OS support, stubs just fail or ignore */
1967 
1968 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1969 {
1970     (void) rs;
1971     (void) offset;
1972 
1973     return NULL;
1974 }
1975 
1976 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1977         unsigned long start_page)
1978 {
1979     (void) rs;
1980     (void) pss;
1981     (void) start_page;
1982 
1983     return 0;
1984 }
1985 
1986 bool ram_write_tracking_available(void)
1987 {
1988     return false;
1989 }
1990 
1991 bool ram_write_tracking_compatible(void)
1992 {
1993     assert(0);
1994     return false;
1995 }
1996 
1997 int ram_write_tracking_start(void)
1998 {
1999     assert(0);
2000     return -1;
2001 }
2002 
2003 void ram_write_tracking_stop(void)
2004 {
2005     assert(0);
2006 }
2007 #endif /* defined(__linux__) */
2008 
2009 /*
2010  * Check whether two addresses/offsets of the ramblock fall onto the same host
2011  * huge page.  Returns true if so, false otherwise.
2012  */
2013 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2014                                      uint64_t addr2)
2015 {
2016     size_t page_size = qemu_ram_pagesize(rb);
2017 
2018     addr1 = ROUND_DOWN(addr1, page_size);
2019     addr2 = ROUND_DOWN(addr2, page_size);
2020 
2021     return addr1 == addr2;
2022 }
2023 
2024 /*
2025  * Does a previously preempted precopy huge page contain the currently
2026  * requested page?  Returns true if so, false otherwise.
2027  *
2028  * This should happen very rarely, because it means that while sending the
2029  * background precopy stream during postcopy we were sending exactly the page
2030  * that some vcpu faulted on on the destination node.  When it happens, we
2031  * probably don't need to do much besides drop the request, because we know it
2032  * will be serviced right after we restore the precopy stream.  It slightly
2033  * affects the order in which postcopy requests are serviced (it is as if the
2034  * current request were moved to the end of the queue), but that shouldn't be
2035  * a big deal.  The most important thing is that we must _never_ try to send a
2036  * partially sent huge page on the POSTCOPY channel again, otherwise that huge
2037  * page would get "split brain" across the two channels (PRECOPY, POSTCOPY).
2038  */
2039 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2040                                         ram_addr_t offset)
2041 {
2042     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2043 
2044     /* No preemption at all? */
2045     if (!state->preempted) {
2046         return false;
2047     }
2048 
2049     /* Not even the same ramblock? */
2050     if (state->ram_block != block) {
2051         return false;
2052     }
2053 
2054     return offset_on_same_huge_page(block, offset,
2055                                     state->ram_page << TARGET_PAGE_BITS);
2056 }
2057 
2058 /**
2059  * get_queued_page: unqueue a page from the postcopy requests
2060  *
2061  * Skips pages that are already sent (!dirty)
2062  *
2063  * Returns true if a queued page is found
2064  *
2065  * @rs: current RAM state
2066  * @pss: data about the state of the current dirty page scan
2067  */
2068 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2069 {
2070     RAMBlock  *block;
2071     ram_addr_t offset;
2072 
2073     block = unqueue_page(rs, &offset);
2074 
2075     if (block) {
2076         /* See comment above postcopy_preempted_contains() */
2077         if (postcopy_preempted_contains(rs, block, offset)) {
2078             trace_postcopy_preempt_hit(block->idstr, offset);
2079             /*
2080              * If what we preempted previously was exactly what we're
2081              * requesting right now, restore the preempted precopy
2082              * immediately, boosting its priority as it's requested by
2083              * postcopy.
2084              */
2085             postcopy_preempt_restore(rs, pss, true);
2086             return true;
2087         }
2088     } else {
2089         /*
2090          * Poll write faults too if background snapshot is enabled; that's
2091          * when vcpus can get blocked by write-protected pages.
2092          */
2093         block = poll_fault_page(rs, &offset);
2094     }
2095 
2096     if (block) {
2097         /*
2098          * We want the background search to continue from the queued page
2099          * since the guest is likely to want other pages near to the page
2100          * it just requested.
2101          */
2102         pss->block = block;
2103         pss->page = offset >> TARGET_PAGE_BITS;
2104 
2105         /*
2106          * This unqueued page would break the "one round" check, even if
2107          * it is really rare.
2108          */
2109         pss->complete_round = false;
2110         /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2111         pss->postcopy_requested = true;
2112         pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2113     }
2114 
2115     return !!block;
2116 }
2117 
2118 /**
2119  * migration_page_queue_free: drop any remaining pages in the ram
2120  * request queue
2121  *
2122  * It should be empty at the end anyway, but in error cases there may
2123  * be some left.  If any page is left, we drop it.
2124  *
2125  */
2126 static void migration_page_queue_free(RAMState *rs)
2127 {
2128     struct RAMSrcPageRequest *mspr, *next_mspr;
2129     /* This queue generally should be empty - but in the case of a failed
2130      * migration might have some droppings in.
2131      */
2132     RCU_READ_LOCK_GUARD();
2133     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2134         memory_region_unref(mspr->rb->mr);
2135         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2136         g_free(mspr);
2137     }
2138 }
2139 
2140 /**
2141  * ram_save_queue_pages: queue the page for transmission
2142  *
2143  * A request from postcopy destination for example.
2144  *
2145  * Returns zero on success or negative on error
2146  *
2147  * @rbname: Name of the RAMBlock of the request. NULL means the
2148  *          same as the last one.
2149  * @start: starting address from the start of the RAMBlock
2150  * @len: length (in bytes) to send
2151  */
2152 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2153 {
2154     RAMBlock *ramblock;
2155     RAMState *rs = ram_state;
2156 
2157     ram_counters.postcopy_requests++;
2158     RCU_READ_LOCK_GUARD();
2159 
2160     if (!rbname) {
2161         /* Reuse last RAMBlock */
2162         ramblock = rs->last_req_rb;
2163 
2164         if (!ramblock) {
2165             /*
2166              * Shouldn't happen, we can't reuse the last RAMBlock if
2167              * it's the 1st request.
2168              */
2169             error_report("ram_save_queue_pages no previous block");
2170             return -1;
2171         }
2172     } else {
2173         ramblock = qemu_ram_block_by_name(rbname);
2174 
2175         if (!ramblock) {
2176             /* We shouldn't be asked for a non-existent RAMBlock */
2177             error_report("ram_save_queue_pages no block '%s'", rbname);
2178             return -1;
2179         }
2180         rs->last_req_rb = ramblock;
2181     }
2182     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2183     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2184         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2185                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2186                      __func__, start, len, ramblock->used_length);
2187         return -1;
2188     }
2189 
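    /* Build the request entry; it will be consumed later by unqueue_page() */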
2190     struct RAMSrcPageRequest *new_entry =
2191         g_new0(struct RAMSrcPageRequest, 1);
2192     new_entry->rb = ramblock;
2193     new_entry->offset = start;
2194     new_entry->len = len;
2195 
2196     memory_region_ref(ramblock->mr);
2197     qemu_mutex_lock(&rs->src_page_req_mutex);
2198     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2199     migration_make_urgent_request();
2200     qemu_mutex_unlock(&rs->src_page_req_mutex);
2201 
2202     return 0;
2203 }
2204 
2205 static bool save_page_use_compression(RAMState *rs)
2206 {
2207     if (!migrate_use_compression()) {
2208         return false;
2209     }
2210 
2211     /*
2212      * If xbzrle is enabled (e.g., after first round of migration), stop
2213      * using the data compression. In theory, xbzrle can do better than
2214      * compression.
2215      */
2216     if (rs->xbzrle_enabled) {
2217         return false;
2218     }
2219 
2220     return true;
2221 }
2222 
2223 /*
2224  * try to compress the page before posting it out, return true if the page
2225  * has been properly handled by compression, otherwise needs other
2226  * paths to handle it
2227  */
2228 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2229 {
2230     if (!save_page_use_compression(rs)) {
2231         return false;
2232     }
2233 
2234     /*
2235      * When starting the process of a new block, the first page of
2236      * the block should be sent out before other pages in the same
2237      * block, and all the pages in the last block should have been sent
2238      * out; keeping this order is important, because the 'cont' flag
2239      * is used to avoid resending the block name.
2240      *
2241      * We post the first page as a normal page because compression takes
2242      * a lot of CPU resources.
2243      */
2244     if (block != rs->last_sent_block) {
2245         flush_compressed_data(rs);
2246         return false;
2247     }
2248 
2249     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2250         return true;
2251     }
2252 
2253     compression_counters.busy++;
2254     return false;
2255 }
2256 
2257 /**
2258  * ram_save_target_page: save one target page
2259  *
2260  * Returns the number of pages written
2261  *
2262  * @rs: current RAM state
2263  * @pss: data about the page we want to send
2264  */
2265 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2266 {
2267     RAMBlock *block = pss->block;
2268     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2269     int res;
2270 
2271     if (control_save_page(rs, block, offset, &res)) {
2272         return res;
2273     }
2274 
2275     if (save_compress_page(rs, block, offset)) {
2276         return 1;
2277     }
2278 
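    /* Try to send the page as a zero page before doing a full page send */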
2279     res = save_zero_page(rs, block, offset);
2280     if (res > 0) {
2281         /* Must let xbzrle know, otherwise a previously cached (now zeroed)
2282          * page would be stale
2283          */
2284         if (!save_page_use_compression(rs)) {
2285             XBZRLE_cache_lock();
2286             xbzrle_cache_zero_page(rs, block->offset + offset);
2287             XBZRLE_cache_unlock();
2288         }
2289         return res;
2290     }
2291 
2292     /*
2293      * Do not use multifd for:
2294      * 1. Compression, as the first page in a new block should be posted out
2295      *    before sending the compressed pages
2296      * 2. Postcopy, as one whole host page should be placed atomically
2297      */
2298     if (!save_page_use_compression(rs) && migrate_use_multifd()
2299         && !migration_in_postcopy()) {
2300         return ram_save_multifd_page(rs, block, offset);
2301     }
2302 
2303     return ram_save_page(rs, pss);
2304 }
2305 
2306 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2307 {
2308     MigrationState *ms = migrate_get_current();
2309 
2310     /* Not enabled eager preempt?  Then never do that. */
2311     if (!migrate_postcopy_preempt()) {
2312         return false;
2313     }
2314 
2315     /* If the user explicitly disabled breaking of huge page, skip */
2316     if (!ms->postcopy_preempt_break_huge) {
2317         return false;
2318     }
2319 
2320     /* If the ramblock we're sending is a small page?  Never bother. */
2321     if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2322         return false;
2323     }
2324 
2325     /* Not in postcopy at all? */
2326     if (!migration_in_postcopy()) {
2327         return false;
2328     }
2329 
2330     /*
2331      * If we're already handling a postcopy request, don't preempt as this page
2332      * has got the same high priority.
2333      */
2334     if (pss->postcopy_requested) {
2335         return false;
2336     }
2337 
2338     /* If there's postcopy requests, then check it up! */
2339     return postcopy_has_request(rs);
2340 }
2341 
2342 /* Preempt the precopy stream: remember where we stopped so we can resume it later */
2343 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2344 {
2345     PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2346 
2347     trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2348 
2349     /*
2350      * Time to preempt precopy. Cache current PSS into preempt state, so that
2351      * after handling the postcopy pages we can recover to it.  We need to do
2352      * so because the dest VM will have part of the precopy huge page kept
2353      * over in its tmp huge page caches; better to move on with it when we can.
2354      */
2355     p_state->ram_block = pss->block;
2356     p_state->ram_page = pss->page;
2357     p_state->preempted = true;
2358 }
2359 
2360 /* Whether we were preempted by a postcopy request while sending a huge page */
2361 static bool postcopy_preempt_triggered(RAMState *rs)
2362 {
2363     return rs->postcopy_preempt_state.preempted;
2364 }
2365 
2366 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2367                                      bool postcopy_requested)
2368 {
2369     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2370 
2371     assert(state->preempted);
2372 
2373     pss->block = state->ram_block;
2374     pss->page = state->ram_page;
2375 
2376     /* Record whether this is a postcopy request */
2377     pss->postcopy_requested = postcopy_requested;
2378     /*
2379      * When restoring a preempted page, the old data resides in PRECOPY
2380      * slow channel, even if postcopy_requested is set.  So always use
2381      * PRECOPY channel here.
2382      */
2383     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2384 
2385     trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2386 
2387     /* Reset preempt state, most importantly, set preempted==false */
2388     postcopy_preempt_reset(rs);
2389 }
2390 
2391 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2392 {
2393     MigrationState *s = migrate_get_current();
2394     unsigned int channel = pss->postcopy_target_channel;
2395     QEMUFile *next;
2396 
2397     if (channel != rs->postcopy_channel) {
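        /* Pick the QEMUFile that backs the requested channel */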
2398         if (channel == RAM_CHANNEL_PRECOPY) {
2399             next = s->to_dst_file;
2400         } else {
2401             next = s->postcopy_qemufile_src;
2402         }
2403         /* Update and cache the current channel */
2404         rs->f = next;
2405         rs->postcopy_channel = channel;
2406 
2407         /*
2408          * If channel switched, reset last_sent_block since the old sent block
2409          * may not be on the same channel.
2410          */
2411         rs->last_sent_block = NULL;
2412 
2413         trace_postcopy_preempt_switch_channel(channel);
2414     }
2415 
2416     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2417 }
2418 
2419 /* We need to make sure rs->f always points to the default channel elsewhere */
2420 static void postcopy_preempt_reset_channel(RAMState *rs)
2421 {
2422     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2423         rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2424         rs->f = migrate_get_current()->to_dst_file;
2425         trace_postcopy_preempt_reset_channel();
2426     }
2427 }
2428 
2429 /**
2430  * ram_save_host_page: save a whole host page
2431  *
2432  * Starting at pss->page, send pages up to the end of the current host
2433  * page. It's valid for the initial offset to point into the middle of
2434  * a host page in which case the remainder of the hostpage is sent.
2435  * Only dirty target pages are sent. Note that the host page size may
2436  * be a huge page for this block.
2437  * The saving stops at the boundary of the used_length of the block
2438  * if the RAMBlock isn't a multiple of the host page size.
2439  *
2440  * Returns the number of pages written or negative on error
2441  *
2442  * @rs: current RAM state
2443  * @pss: data about the page we want to send
2444  */
2445 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2446 {
2447     int tmppages, pages = 0;
2448     size_t pagesize_bits =
2449         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2450     unsigned long hostpage_boundary =
2451         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2452     unsigned long start_page = pss->page;
2453     int res;
2454 
2455     if (ramblock_is_ignored(pss->block)) {
2456         error_report("block %s should not be migrated !", pss->block->idstr);
2457         return 0;
2458     }
2459 
2460     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2461         postcopy_preempt_choose_channel(rs, pss);
2462     }
2463 
2464     do {
2465         if (postcopy_needs_preempt(rs, pss)) {
2466             postcopy_do_preempt(rs, pss);
2467             break;
2468         }
2469 
2470         /* Check if the page is dirty and, if it is, send it */
2471         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2472             tmppages = ram_save_target_page(rs, pss);
2473             if (tmppages < 0) {
2474                 return tmppages;
2475             }
2476 
2477             pages += tmppages;
2478             /*
2479              * Allow rate limiting to happen in the middle of huge pages if
2480              * something is sent in the current iteration.
2481              */
2482             if (pagesize_bits > 1 && tmppages > 0) {
2483                 migration_rate_limit();
2484             }
2485         }
2486         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2487     } while ((pss->page < hostpage_boundary) &&
2488              offset_in_ramblock(pss->block,
2489                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2490     /* The offset we leave with is the min boundary of host page and block */
2491     pss->page = MIN(pss->page, hostpage_boundary);
2492 
2493     /*
2494      * In postcopy preempt mode, flush the data as soon as possible for
2495      * postcopy requests, because we've already sent a whole huge page, so the
2496      * dst node should already have enough resources to atomically fill in
2497      * the current missing page.
2498      *
2499      * More importantly, when using a separate postcopy channel, we must do
2500      * an explicit flush or it won't flush until the buffer is full.
2501      */
2502     if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2503         qemu_fflush(rs->f);
2504     }
2505 
2506     res = ram_save_release_protection(rs, pss, start_page);
2507     return (res < 0 ? res : pages);
2508 }
2509 
2510 /**
2511  * ram_find_and_save_block: finds a dirty page and sends it to f
2512  *
2513  * Called within an RCU critical section.
2514  *
2515  * Returns the number of pages written where zero means no dirty pages,
2516  * or negative on error
2517  *
2518  * @rs: current RAM state
2519  *
2520  * On systems where host-page-size > target-page-size it will send all the
2521  * pages in a host page that are dirty.
2522  */
2523 static int ram_find_and_save_block(RAMState *rs)
2524 {
2525     PageSearchStatus pss;
2526     int pages = 0;
2527     bool again, found;
2528 
2529     /* No dirty page as there is zero RAM */
2530     if (!ram_bytes_total()) {
2531         return pages;
2532     }
2533 
2534     pss.block = rs->last_seen_block;
2535     pss.page = rs->last_page;
2536     pss.complete_round = false;
2537 
2538     if (!pss.block) {
2539         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2540     }
2541 
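    /*
     * Prefer queued (postcopy-requested) pages; otherwise scan the dirty
     * bitmap.  Loop until something is sent or the whole of RAM is scanned.
     */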
2542     do {
2543         again = true;
2544         found = get_queued_page(rs, &pss);
2545 
2546         if (!found) {
2547             /*
2548              * Recover previous precopy ramblock/offset if postcopy has
2549              * preempted precopy.  Otherwise find the next dirty bit.
2550              */
2551             if (postcopy_preempt_triggered(rs)) {
2552                 postcopy_preempt_restore(rs, &pss, false);
2553                 found = true;
2554             } else {
2555                 /* priority queue empty, so just search for something dirty */
2556                 found = find_dirty_block(rs, &pss, &again);
2557             }
2558         }
2559 
2560         if (found) {
2561             pages = ram_save_host_page(rs, &pss);
2562         }
2563     } while (!pages && again);
2564 
2565     rs->last_seen_block = pss.block;
2566     rs->last_page = pss.page;
2567 
2568     return pages;
2569 }
2570 
2571 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2572 {
2573     uint64_t pages = size / TARGET_PAGE_SIZE;
2574 
2575     if (zero) {
2576         ram_counters.duplicate += pages;
2577     } else {
2578         ram_counters.normal += pages;
2579         ram_transferred_add(size);
2580         qemu_file_credit_transfer(f, size);
2581     }
2582 }
2583 
2584 static uint64_t ram_bytes_total_common(bool count_ignored)
2585 {
2586     RAMBlock *block;
2587     uint64_t total = 0;
2588 
2589     RCU_READ_LOCK_GUARD();
2590 
2591     if (count_ignored) {
2592         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2593             total += block->used_length;
2594         }
2595     } else {
2596         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2597             total += block->used_length;
2598         }
2599     }
2600     return total;
2601 }
2602 
2603 uint64_t ram_bytes_total(void)
2604 {
2605     return ram_bytes_total_common(false);
2606 }
2607 
2608 static void xbzrle_load_setup(void)
2609 {
2610     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2611 }
2612 
2613 static void xbzrle_load_cleanup(void)
2614 {
2615     g_free(XBZRLE.decoded_buf);
2616     XBZRLE.decoded_buf = NULL;
2617 }
2618 
2619 static void ram_state_cleanup(RAMState **rsp)
2620 {
2621     if (*rsp) {
2622         migration_page_queue_free(*rsp);
2623         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2624         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2625         g_free(*rsp);
2626         *rsp = NULL;
2627     }
2628 }
2629 
2630 static void xbzrle_cleanup(void)
2631 {
2632     XBZRLE_cache_lock();
2633     if (XBZRLE.cache) {
2634         cache_fini(XBZRLE.cache);
2635         g_free(XBZRLE.encoded_buf);
2636         g_free(XBZRLE.current_buf);
2637         g_free(XBZRLE.zero_target_page);
2638         XBZRLE.cache = NULL;
2639         XBZRLE.encoded_buf = NULL;
2640         XBZRLE.current_buf = NULL;
2641         XBZRLE.zero_target_page = NULL;
2642     }
2643     XBZRLE_cache_unlock();
2644 }
2645 
2646 static void ram_save_cleanup(void *opaque)
2647 {
2648     RAMState **rsp = opaque;
2649     RAMBlock *block;
2650 
2651     /* We don't use dirty log with background snapshots */
2652     if (!migrate_background_snapshot()) {
2653         /* The caller holds the iothread lock or is in a bh, so there is
2654          * no writing race against the migration bitmap
2655          */
2656         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2657             /*
2658              * Do not stop the dirty log without having started it, since
2659              * memory_global_dirty_log_stop will assert that
2660              * memory_global_dirty_log_start/stop are used in pairs
2661              */
2662             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2663         }
2664     }
2665 
2666     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667         g_free(block->clear_bmap);
2668         block->clear_bmap = NULL;
2669         g_free(block->bmap);
2670         block->bmap = NULL;
2671     }
2672 
2673     xbzrle_cleanup();
2674     compress_threads_save_cleanup();
2675     ram_state_cleanup(rsp);
2676 }
2677 
2678 static void ram_state_reset(RAMState *rs)
2679 {
2680     rs->last_seen_block = NULL;
2681     rs->last_sent_block = NULL;
2682     rs->last_page = 0;
2683     rs->last_version = ram_list.version;
2684     rs->xbzrle_enabled = false;
2685     postcopy_preempt_reset(rs);
2686     rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2687 }
2688 
2689 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2690 
2691 /* **** functions for postcopy ***** */
2692 
2693 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2694 {
2695     struct RAMBlock *block;
2696 
2697     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2698         unsigned long *bitmap = block->bmap;
2699         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2700         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2701 
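        /* Discard each run of clean (already sent) pages */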
2702         while (run_start < range) {
2703             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2704             ram_discard_range(block->idstr,
2705                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2706                               ((ram_addr_t)(run_end - run_start))
2707                                 << TARGET_PAGE_BITS);
2708             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2709         }
2710     }
2711 }
2712 
2713 /**
2714  * postcopy_send_discard_bm_ram: discard a RAMBlock
2715  *
2716  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2717  *
2718  * @ms: current migration state
2719  * @block: RAMBlock to discard
2720  */
2721 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2722 {
2723     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2724     unsigned long current;
2725     unsigned long *bitmap = block->bmap;
2726 
2727     for (current = 0; current < end; ) {
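    /* Send one discard command per contiguous run of dirty pages */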
2728         unsigned long one = find_next_bit(bitmap, end, current);
2729         unsigned long zero, discard_length;
2730 
2731         if (one >= end) {
2732             break;
2733         }
2734 
2735         zero = find_next_zero_bit(bitmap, end, one + 1);
2736 
2737         if (zero >= end) {
2738             discard_length = end - one;
2739         } else {
2740             discard_length = zero - one;
2741         }
2742         postcopy_discard_send_range(ms, one, discard_length);
2743         current = one + discard_length;
2744     }
2745 }
2746 
2747 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2748 
2749 /**
2750  * postcopy_each_ram_send_discard: discard all RAMBlocks
2751  *
2752  * Utility for the outgoing postcopy code.
2753  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2754  *   passing it bitmap indexes and name.
2755  * (qemu_ram_foreach_block ends up passing unscaled lengths
2756  *  which would mean postcopy code would have to deal with target page)
2757  *
2758  * @ms: current migration state
2759  */
2760 static void postcopy_each_ram_send_discard(MigrationState *ms)
2761 {
2762     struct RAMBlock *block;
2763 
2764     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2765         postcopy_discard_send_init(ms, block->idstr);
2766 
2767         /*
2768          * Deal with TPS != HPS and huge pages.  It discards any partially sent
2769          * host-page size chunks and marks any partially dirty host-page size
2770          * chunks as all dirty.  In this case the host-page is the host-page
2771          * for the particular RAMBlock, i.e. it might be a huge page.
2772          */
2773         postcopy_chunk_hostpages_pass(ms, block);
2774 
2775         /*
2776          * Postcopy sends chunks of bitmap over the wire, but it
2777          * just needs indexes at this point, which avoids it having
2778          * target-page-specific code.
2779          */
2780         postcopy_send_discard_bm_ram(ms, block);
2781         postcopy_discard_send_finish(ms);
2782     }
2783 }
2784 
2785 /**
2786  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2787  *
2788  * Helper for postcopy_chunk_hostpages; it's called twice to
2789  * canonicalize the two bitmaps, that are similar, but one is
2790  * inverted.
2791  *
2792  * Postcopy requires that all target pages in a hostpage are dirty or
2793  * clean, not a mix.  This function canonicalizes the bitmaps.
2794  *
2795  * @ms: current migration state
2796  * @block: block that contains the page we want to canonicalize
2797  */
2798 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2799 {
2800     RAMState *rs = ram_state;
2801     unsigned long *bitmap = block->bmap;
2802     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2803     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2804     unsigned long run_start;
2805 
2806     if (block->page_size == TARGET_PAGE_SIZE) {
2807         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2808         return;
2809     }
2810 
2811     /* Find a dirty page */
2812     run_start = find_next_bit(bitmap, pages, 0);
2813 
2814     while (run_start < pages) {
2815 
2816         /*
2817          * If the start of this run of pages is in the middle of a host
2818          * page, then we need to fixup this host page.
2819          */
2820         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2821             /* Find the end of this run */
2822             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2823             /*
2824              * If the end isn't at the start of a host page, then the
2825              * run doesn't finish at the end of a host page
2826              * and we need to discard.
2827              */
2828         }
2829 
2830         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2831             unsigned long page;
2832             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2833                                                              host_ratio);
2834             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2835 
2836             /* Clean up the bitmap */
2837             for (page = fixup_start_addr;
2838                  page < fixup_start_addr + host_ratio; page++) {
2839                 /*
2840                  * Remark them as dirty, updating the count for any pages
2841                  * that weren't previously dirty.
2842                  */
2843                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2844             }
2845         }
2846 
2847         /* Find the next dirty page for the next iteration */
2848         run_start = find_next_bit(bitmap, pages, run_start);
2849     }
2850 }
2851 
2852 /**
2853  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2854  *
2855  * Transmit the set of pages to be discarded after precopy to the target;
2856  * these are pages that:
2857  *     a) Have been previously transmitted but are now dirty again
2858  *     b) Have never been transmitted; this ensures that
2859  *        any pages on the destination that have been mapped by background
2860  *        tasks get discarded (transparent huge pages are the specific concern)
2861  * Hopefully this is pretty sparse
2862  *
2863  * @ms: current migration state
2864  */
2865 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2866 {
2867     RAMState *rs = ram_state;
2868 
2869     RCU_READ_LOCK_GUARD();
2870 
2871     /* This should be our last sync, the src is now paused */
2872     migration_bitmap_sync(rs);
2873 
2874     /* Easiest way to make sure we don't resume in the middle of a host-page */
2875     rs->last_seen_block = NULL;
2876     rs->last_sent_block = NULL;
2877     rs->last_page = 0;
2878 
2879     postcopy_each_ram_send_discard(ms);
2880 
2881     trace_ram_postcopy_send_discard_bitmap();
2882 }
2883 
2884 /**
2885  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2886  *
2887  * Returns zero on success
2888  *
2889  * @rbname: name of the RAMBlock of the request. NULL means the
2890  *          same as the last one.
2891  * @start: starting offset (in bytes) within the RAMBlock
2892  * @length: length (in bytes) of the range to discard
2893  */
2894 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2895 {
2896     trace_ram_discard_range(rbname, start, length);
2897 
2898     RCU_READ_LOCK_GUARD();
2899     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2900 
2901     if (!rb) {
2902         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2903         return -1;
2904     }
2905 
2906     /*
2907      * On source VM, we don't need to update the received bitmap since
2908      * we don't even have one.
2909      */
2910     if (rb->receivedmap) {
2911         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2912                      length >> qemu_target_page_bits());
2913     }
2914 
2915     return ram_block_discard_range(rb, start, length);
2916 }
2917 
2918 /*
2919  * For every allocation, we will try not to crash the VM if the
2920  * allocation fails.
2921  */
2922 static int xbzrle_init(void)
2923 {
2924     Error *local_err = NULL;
2925 
2926     if (!migrate_use_xbzrle()) {
2927         return 0;
2928     }
2929 
2930     XBZRLE_cache_lock();
2931 
2932     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2933     if (!XBZRLE.zero_target_page) {
2934         error_report("%s: Error allocating zero page", __func__);
2935         goto err_out;
2936     }
2937 
2938     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2939                               TARGET_PAGE_SIZE, &local_err);
2940     if (!XBZRLE.cache) {
2941         error_report_err(local_err);
2942         goto free_zero_page;
2943     }
2944 
2945     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2946     if (!XBZRLE.encoded_buf) {
2947         error_report("%s: Error allocating encoded_buf", __func__);
2948         goto free_cache;
2949     }
2950 
2951     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2952     if (!XBZRLE.current_buf) {
2953         error_report("%s: Error allocating current_buf", __func__);
2954         goto free_encoded_buf;
2955     }
2956 
2957     /* We are all good */
2958     XBZRLE_cache_unlock();
2959     return 0;
2960 
2961 free_encoded_buf:
2962     g_free(XBZRLE.encoded_buf);
2963     XBZRLE.encoded_buf = NULL;
2964 free_cache:
2965     cache_fini(XBZRLE.cache);
2966     XBZRLE.cache = NULL;
2967 free_zero_page:
2968     g_free(XBZRLE.zero_target_page);
2969     XBZRLE.zero_target_page = NULL;
2970 err_out:
2971     XBZRLE_cache_unlock();
2972     return -ENOMEM;
2973 }
2974 
2975 static int ram_state_init(RAMState **rsp)
2976 {
2977     *rsp = g_try_new0(RAMState, 1);
2978 
2979     if (!*rsp) {
2980         error_report("%s: Init ramstate fail", __func__);
2981         return -1;
2982     }
2983 
2984     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2985     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2986     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2987 
2988     /*
2989      * Count the total number of pages used by ram blocks not including any
2990      * gaps due to alignment or unplugs.
2991      * This must match the initial values of the dirty bitmap.
2992      */
2993     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2994     ram_state_reset(*rsp);
2995 
2996     return 0;
2997 }
2998 
2999 static void ram_list_init_bitmaps(void)
3000 {
3001     MigrationState *ms = migrate_get_current();
3002     RAMBlock *block;
3003     unsigned long pages;
3004     uint8_t shift;
3005 
3006     /* Skip setting bitmap if there is no RAM */
3007     if (ram_bytes_total()) {
3008         shift = ms->clear_bitmap_shift;
3009         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3010             error_report("clear_bitmap_shift (%u) too big, using "
3011                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3012             shift = CLEAR_BITMAP_SHIFT_MAX;
3013         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3014             error_report("clear_bitmap_shift (%u) too small, using "
3015                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3016             shift = CLEAR_BITMAP_SHIFT_MIN;
3017         }
3018 
3019         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3020             pages = block->max_length >> TARGET_PAGE_BITS;
3021             /*
3022              * The initial dirty bitmap for migration must be set with all
3023              * ones to make sure we'll migrate every guest RAM page to the
3024              * destination.
3025              * Here we set RAMBlock.bmap all to 1 because when restarting a
3026              * migration after a failed one, ram_list.
3027              * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3028              * guest memory.
3029              */
3030             block->bmap = bitmap_new(pages);
3031             bitmap_set(block->bmap, 0, pages);
3032             block->clear_bmap_shift = shift;
3033             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3034         }
3035     }
3036 }
3037 
3038 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3039 {
3040     unsigned long pages;
3041     RAMBlock *rb;
3042 
3043     RCU_READ_LOCK_GUARD();
3044 
3045     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3046         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3047         rs->migration_dirty_pages -= pages;
3048     }
3049 }
3050 
3051 static void ram_init_bitmaps(RAMState *rs)
3052 {
3053     /* For memory_global_dirty_log_start below.  */
3054     qemu_mutex_lock_iothread();
3055     qemu_mutex_lock_ramlist();
3056 
3057     WITH_RCU_READ_LOCK_GUARD() {
3058         ram_list_init_bitmaps();
3059         /* We don't use dirty log with background snapshots */
3060         if (!migrate_background_snapshot()) {
3061             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3062             migration_bitmap_sync_precopy(rs);
3063         }
3064     }
3065     qemu_mutex_unlock_ramlist();
3066     qemu_mutex_unlock_iothread();
3067 
3068     /*
3069      * After an eventual first bitmap sync, fixup the initial bitmap
3070      * containing all 1s to exclude any discarded pages from migration.
3071      */
3072     migration_bitmap_clear_discarded_pages(rs);
3073 }
3074 
3075 static int ram_init_all(RAMState **rsp)
3076 {
3077     if (ram_state_init(rsp)) {
3078         return -1;
3079     }
3080 
3081     if (xbzrle_init()) {
3082         ram_state_cleanup(rsp);
3083         return -1;
3084     }
3085 
3086     ram_init_bitmaps(*rsp);
3087 
3088     return 0;
3089 }
3090 
3091 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3092 {
3093     RAMBlock *block;
3094     uint64_t pages = 0;
3095 
3096     /*
3097      * Postcopy is not using xbzrle/compression, so no need for that.
3098      * Also, since the source is already halted, we don't need to care
3099      * about dirty page logging either.
3100      */
3101 
3102     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3103         pages += bitmap_count_one(block->bmap,
3104                                   block->used_length >> TARGET_PAGE_BITS);
3105     }
3106 
3107     /* This may not be aligned with current bitmaps. Recalculate. */
3108     rs->migration_dirty_pages = pages;
3109 
3110     ram_state_reset(rs);
3111 
3112     /* Update RAMState cache of output QEMUFile */
3113     rs->f = out;
3114 
3115     trace_ram_state_resume_prepare(pages);
3116 }
3117 
3118 /*
3119  * This function clears bits of the free pages reported by the caller from the
3120  * migration dirty bitmap. @addr is the host address corresponding to the
3121  * start of the contiguous guest free pages, and @len is the total size in
3122  * bytes of those pages.
3123  */
3124 void qemu_guest_free_page_hint(void *addr, size_t len)
3125 {
3126     RAMBlock *block;
3127     ram_addr_t offset;
3128     size_t used_len, start, npages;
3129     MigrationState *s = migrate_get_current();
3130 
3131     /* This function is currently expected to be used during live migration */
3132     if (!migration_is_setup_or_active(s->state)) {
3133         return;
3134     }
3135 
3136     for (; len > 0; len -= used_len, addr += used_len) {
3137         block = qemu_ram_block_from_host(addr, false, &offset);
3138         if (unlikely(!block || offset >= block->used_length)) {
3139             /*
3140              * The implementation might not support RAMBlock resize during
3141              * live migration, but it could happen in theory with future
3142              * updates. So we add a check here to capture that case.
3143              */
3144             error_report_once("%s unexpected error", __func__);
3145             return;
3146         }
3147 
3148         if (len <= block->used_length - offset) {
3149             used_len = len;
3150         } else {
3151             used_len = block->used_length - offset;
3152         }
3153 
3154         start = offset >> TARGET_PAGE_BITS;
3155         npages = used_len >> TARGET_PAGE_BITS;
3156 
3157         qemu_mutex_lock(&ram_state->bitmap_mutex);
3158         /*
3159          * The skipped free pages are equivalent to having been sent from
3160          * clear_bmap's perspective, so clear the bits from the memory region bitmap which
3161          * are initially set. Otherwise those skipped pages will be sent in
3162          * the next round after syncing from the memory region bitmap.
3163          */
3164         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3165         ram_state->migration_dirty_pages -=
3166                       bitmap_count_one_with_offset(block->bmap, start, npages);
3167         bitmap_clear(block->bmap, start, npages);
3168         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3169     }
3170 }
3171 
3172 /*
3173  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3174  * a long-running RCU critical section.  When rcu-reclaims in the code
3175  * start to become numerous it will be necessary to reduce the
3176  * granularity of these critical sections.
3177  */
3178 
3179 /**
3180  * ram_save_setup: Setup RAM for migration
3181  *
3182  * Returns zero to indicate success and negative for error
3183  *
3184  * @f: QEMUFile where to send the data
3185  * @opaque: RAMState pointer
3186  */
3187 static int ram_save_setup(QEMUFile *f, void *opaque)
3188 {
3189     RAMState **rsp = opaque;
3190     RAMBlock *block;
3191     int ret;
3192 
3193     if (compress_threads_save_setup()) {
3194         return -1;
3195     }
3196 
3197     /* migration has already set up the bitmap; reuse it. */
3198     if (!migration_in_colo_state()) {
3199         if (ram_init_all(rsp) != 0) {
3200             compress_threads_save_cleanup();
3201             return -1;
3202         }
3203     }
3204     (*rsp)->f = f;
3205 
3206     WITH_RCU_READ_LOCK_GUARD() {
3207         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3208 
3209         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3210             qemu_put_byte(f, strlen(block->idstr));
3211             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3212             qemu_put_be64(f, block->used_length);
3213             if (migrate_postcopy_ram() && block->page_size !=
3214                                           qemu_host_page_size) {
3215                 qemu_put_be64(f, block->page_size);
3216             }
3217             if (migrate_ignore_shared()) {
3218                 qemu_put_be64(f, block->mr->addr);
3219             }
3220         }
3221     }
3222 
3223     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3224     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3225 
3226     ret = multifd_send_sync_main(f);
3227     if (ret < 0) {
3228         return ret;
3229     }
3230 
3231     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3232     qemu_fflush(f);
3233 
3234     return 0;
3235 }
3236 
3237 /**
3238  * ram_save_iterate: iterative stage for migration
3239  *
3240  * Returns zero to indicate success and negative for error
3241  *
3242  * @f: QEMUFile where to send the data
3243  * @opaque: RAMState pointer
3244  */
3245 static int ram_save_iterate(QEMUFile *f, void *opaque)
3246 {
3247     RAMState **temp = opaque;
3248     RAMState *rs = *temp;
3249     int ret = 0;
3250     int i;
3251     int64_t t0;
3252     int done = 0;
3253 
3254     if (blk_mig_bulk_active()) {
3255         /* Avoid transferring ram during bulk phase of block migration as
3256          * the bulk phase will usually take a long time and transferring
3257          * ram updates during that time is pointless. */
3258         goto out;
3259     }
3260 
3261     /*
3262      * We'll hold this lock for a while, but that's okay for two reasons.
3263      * Firstly, the only other thread that may take it is the one calling
3264      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3265      * MAX_WAIT below (if curious, further see commit 4508bd9ed8053ce), which
3266      * guarantees that we release it on a regular basis.
3267      */
3268     qemu_mutex_lock(&rs->bitmap_mutex);
3269     WITH_RCU_READ_LOCK_GUARD() {
3270         if (ram_list.version != rs->last_version) {
3271             ram_state_reset(rs);
3272         }
3273 
3274         /* Read version before ram_list.blocks */
3275         smp_rmb();
3276 
3277         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3278 
3279         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3280         i = 0;
3281         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3282                postcopy_has_request(rs)) {
3283             int pages;
3284 
3285             if (qemu_file_get_error(f)) {
3286                 break;
3287             }
3288 
3289             pages = ram_find_and_save_block(rs);
3290             /* no more pages to send */
3291             if (pages == 0) {
3292                 done = 1;
3293                 break;
3294             }
3295 
3296             if (pages < 0) {
3297                 qemu_file_set_error(f, pages);
3298                 break;
3299             }
3300 
3301             rs->target_page_count += pages;
3302 
3303             /*
3304              * During postcopy, it is necessary to make sure one whole host
3305              * page is sent in one chunk.
3306              */
3307             if (migrate_postcopy_ram()) {
3308                 flush_compressed_data(rs);
3309             }
3310 
3311             /*
3312              * We want to check in the 1st loop, just in case it was the 1st
3313              * time and we had to sync the dirty bitmap.
3314              * qemu_clock_get_ns() is a bit expensive, so we only check it
3315              * every few iterations.
3316              */
3317             if ((i & 63) == 0) {
3318                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3319                               1000000;
3320                 if (t1 > MAX_WAIT) {
3321                     trace_ram_save_iterate_big_wait(t1, i);
3322                     break;
3323                 }
3324             }
3325             i++;
3326         }
3327     }
3328     qemu_mutex_unlock(&rs->bitmap_mutex);
3329 
3330     postcopy_preempt_reset_channel(rs);
3331 
3332     /*
3333      * Must occur before EOS (or any QEMUFile operation)
3334      * because of RDMA protocol.
3335      */
3336     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3337 
3338 out:
3339     if (ret >= 0
3340         && migration_is_setup_or_active(migrate_get_current()->state)) {
3341         ret = multifd_send_sync_main(rs->f);
3342         if (ret < 0) {
3343             return ret;
3344         }
3345 
3346         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3347         qemu_fflush(f);
3348         ram_transferred_add(8);
3349 
3350         ret = qemu_file_get_error(f);
3351     }
3352     if (ret < 0) {
3353         return ret;
3354     }
3355 
3356     return done;
3357 }
3358 
3359 /**
3360  * ram_save_complete: function called to send the remaining amount of ram
3361  *
3362  * Returns zero to indicate success or negative on error
3363  *
3364  * Called with iothread lock
3365  *
3366  * @f: QEMUFile where to send the data
3367  * @opaque: RAMState pointer
3368  */
3369 static int ram_save_complete(QEMUFile *f, void *opaque)
3370 {
3371     RAMState **temp = opaque;
3372     RAMState *rs = *temp;
3373     int ret = 0;
3374 
3375     rs->last_stage = !migration_in_colo_state();
3376 
3377     WITH_RCU_READ_LOCK_GUARD() {
3378         if (!migration_in_postcopy()) {
3379             migration_bitmap_sync_precopy(rs);
3380         }
3381 
3382         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3383 
3384         /* try transferring iterative blocks of memory */
3385 
3386         /* flush all remaining blocks regardless of rate limiting */
3387         while (true) {
3388             int pages;
3389 
3390             pages = ram_find_and_save_block(rs);
3391             /* no more blocks to send */
3392             if (pages == 0) {
3393                 break;
3394             }
3395             if (pages < 0) {
3396                 ret = pages;
3397                 break;
3398             }
3399         }
3400 
3401         flush_compressed_data(rs);
3402         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3403     }
3404 
3405     if (ret < 0) {
3406         return ret;
3407     }
3408 
3409     postcopy_preempt_reset_channel(rs);
3410 
3411     ret = multifd_send_sync_main(rs->f);
3412     if (ret < 0) {
3413         return ret;
3414     }
3415 
3416     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3417     qemu_fflush(f);
3418 
3419     return 0;
3420 }
3421 
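/**
 * ram_save_pending: estimate how much dirty RAM is still left to send
 *
 * Fills in the pending-size counters used by the migration core to decide
 * whether another iteration is needed.  If the estimate drops below
 * @max_size (and we are not in postcopy), the dirty bitmap is re-synced
 * under the iothread lock before recomputing it.
 *
 * @f: QEMUFile the data would be sent to
 * @opaque: RAMState pointer
 * @max_size: threshold below which the dirty bitmap is re-synced
 * @res_precopy_only: output, bytes that can only be sent during precopy
 * @res_compatible: output, bytes that can be sent in either phase
 * @res_postcopy_only: output, bytes that can only be sent during postcopy
 */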
3422 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3423                              uint64_t *res_precopy_only,
3424                              uint64_t *res_compatible,
3425                              uint64_t *res_postcopy_only)
3426 {
3427     RAMState **temp = opaque;
3428     RAMState *rs = *temp;
3429     uint64_t remaining_size;
3430 
3431     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3432 
3433     if (!migration_in_postcopy() &&
3434         remaining_size < max_size) {
3435         qemu_mutex_lock_iothread();
3436         WITH_RCU_READ_LOCK_GUARD() {
3437             migration_bitmap_sync_precopy(rs);
3438         }
3439         qemu_mutex_unlock_iothread();
3440         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3441     }
3442 
3443     if (migrate_postcopy_ram()) {
3444         /* We can do postcopy, and all the data is postcopiable */
3445         *res_compatible += remaining_size;
3446     } else {
3447         *res_precopy_only += remaining_size;
3448     }
3449 }
3450 
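/**
 * load_xbzrle: read and apply one XBZRLE-encoded page from the stream
 *
 * Returns 0 for success or -1 on error
 *
 * Reads the encoding flag and length, then decodes the delta in place
 * against the current contents of @host.
 *
 * @f: QEMUFile to read the encoded page from
 * @addr: RAM address of the page (for the caller's error reporting)
 * @host: host address of the page to update
 */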
3451 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3452 {
3453     unsigned int xh_len;
3454     int xh_flags;
3455     uint8_t *loaded_data;
3456 
3457     /* extract RLE header */
3458     xh_flags = qemu_get_byte(f);
3459     xh_len = qemu_get_be16(f);
3460 
3461     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3462         error_report("Failed to load XBZRLE page - wrong compression!");
3463         return -1;
3464     }
3465 
3466     if (xh_len > TARGET_PAGE_SIZE) {
3467         error_report("Failed to load XBZRLE page - len overflow!");
3468         return -1;
3469     }
3470     loaded_data = XBZRLE.decoded_buf;
3471     /* load data and decode */
3472     /* it can change loaded_data to point to an internal buffer */
3473     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3474 
3475     /* decode RLE */
3476     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3477                              TARGET_PAGE_SIZE) == -1) {
3478         error_report("Failed to load XBZRLE page - decode error!");
3479         return -1;
3480     }
3481 
3482     return 0;
3483 }
3484 
3485 /**
3486  * ram_block_from_stream: read a RAMBlock id from the migration stream
3487  *
3488  * Must be called from within a rcu critical section.
3489  *
3490  * Returns a pointer from within the RCU-protected ram_list.
3491  *
3492  * @mis: the migration incoming state pointer
3493  * @f: QEMUFile where to read the data from
3494  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3495  * @channel: the channel we're using
3496  */
3497 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3498                                               QEMUFile *f, int flags,
3499                                               int channel)
3500 {
3501     RAMBlock *block = mis->last_recv_block[channel];
3502     char id[256];
3503     uint8_t len;
3504 
3505     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3506         if (!block) {
3507             error_report("Ack, bad migration stream!");
3508             return NULL;
3509         }
3510         return block;
3511     }
3512 
3513     len = qemu_get_byte(f);
3514     qemu_get_buffer(f, (uint8_t *)id, len);
3515     id[len] = 0;
3516 
3517     block = qemu_ram_block_by_name(id);
3518     if (!block) {
3519         error_report("Can't find block %s", id);
3520         return NULL;
3521     }
3522 
3523     if (ramblock_is_ignored(block)) {
3524         error_report("block %s should not be migrated !", id);
3525         return NULL;
3526     }
3527 
3528     mis->last_recv_block[channel] = block;
3529 
3530     return block;
3531 }
3532 
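/*
 * Return the host address for @offset within @block, or NULL if the
 * offset lies outside the block's used length.
 */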
3533 static inline void *host_from_ram_block_offset(RAMBlock *block,
3534                                                ram_addr_t offset)
3535 {
3536     if (!offset_in_ramblock(block, offset)) {
3537         return NULL;
3538     }
3539 
3540     return block->host + offset;
3541 }
3542 
3543 static void *host_page_from_ram_block_offset(RAMBlock *block,
3544                                              ram_addr_t offset)
3545 {
3546     /* Note: Explicitly no check against offset_in_ramblock(). */
3547     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3548                                    block->page_size);
3549 }
3550 
3551 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3552                                                          ram_addr_t offset)
3553 {
3554     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3555 }
3556 
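/*
 * Return the address of @offset within the COLO cache of @block, or NULL
 * if the offset is out of range or the cache is not allocated.  With
 * @record_bitmap set, the page is also marked in block->bmap so that the
 * next checkpoint knows it has to be flushed into the SVM's RAM.
 */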
3557 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3558                              ram_addr_t offset, bool record_bitmap)
3559 {
3560     if (!offset_in_ramblock(block, offset)) {
3561         return NULL;
3562     }
3563     if (!block->colo_cache) {
3564         error_report("%s: colo_cache is NULL in block :%s",
3565                      __func__, block->idstr);
3566         return NULL;
3567     }
3568 
3569     /*
3570     * During a COLO checkpoint, we need a bitmap of these migrated pages.
3571     * It helps us to decide which pages in the RAM cache should be flushed
3572     * into the VM's RAM later.
3573     */
3574     if (record_bitmap &&
3575         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3576         ram_state->migration_dirty_pages++;
3577     }
3578     return block->colo_cache + offset;
3579 }
3580 
3581 /**
3582  * ram_handle_compressed: handle the zero page case
3583  *
3584  * If a page (or a whole RDMA chunk) has been
3585  * determined to be zero, then zap it.
3586  *
3587  * @host: host address for the zero page
3588  * @ch: what the page is filled with.  We only support zero
3589  * @size: size of the zero page
3590  */
3591 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3592 {
3593     if (ch != 0 || !buffer_is_zero(host, size)) {
3594         memset(host, ch, size);
3595     }
3596 }
3597 
3598 /* return the size after decompression, or a negative value on error */
3599 static int
3600 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3601                      const uint8_t *source, size_t source_len)
3602 {
3603     int err;
3604 
3605     err = inflateReset(stream);
3606     if (err != Z_OK) {
3607         return -1;
3608     }
3609 
3610     stream->avail_in = source_len;
3611     stream->next_in = (uint8_t *)source;
3612     stream->avail_out = dest_len;
3613     stream->next_out = dest;
3614 
3615     err = inflate(stream, Z_NO_FLUSH);
3616     if (err != Z_STREAM_END) {
3617         return -1;
3618     }
3619 
3620     return stream->total_out;
3621 }
3622 
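/*
 * Body of one decompression thread: wait until a compressed page is handed
 * over via its DecompressParam, inflate it directly into guest memory and
 * then signal decomp_done_cond so the slot can be reused.
 */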
3623 static void *do_data_decompress(void *opaque)
3624 {
3625     DecompressParam *param = opaque;
3626     unsigned long pagesize;
3627     uint8_t *des;
3628     int len, ret;
3629 
3630     qemu_mutex_lock(&param->mutex);
3631     while (!param->quit) {
3632         if (param->des) {
3633             des = param->des;
3634             len = param->len;
3635             param->des = 0;
3636             qemu_mutex_unlock(&param->mutex);
3637 
3638             pagesize = TARGET_PAGE_SIZE;
3639 
3640             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3641                                        param->compbuf, len);
3642             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3643                 error_report("decompress data failed");
3644                 qemu_file_set_error(decomp_file, ret);
3645             }
3646 
3647             qemu_mutex_lock(&decomp_done_lock);
3648             param->done = true;
3649             qemu_cond_signal(&decomp_done_cond);
3650             qemu_mutex_unlock(&decomp_done_lock);
3651 
3652             qemu_mutex_lock(&param->mutex);
3653         } else {
3654             qemu_cond_wait(&param->cond, &param->mutex);
3655         }
3656     }
3657     qemu_mutex_unlock(&param->mutex);
3658 
3659     return NULL;
3660 }
3661 
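/*
 * Wait until every decompression thread has finished its current page.
 * Returns 0 if compression is not in use, otherwise any error that has
 * been recorded on decomp_file.
 */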
3662 static int wait_for_decompress_done(void)
3663 {
3664     int idx, thread_count;
3665 
3666     if (!migrate_use_compression()) {
3667         return 0;
3668     }
3669 
3670     thread_count = migrate_decompress_threads();
3671     qemu_mutex_lock(&decomp_done_lock);
3672     for (idx = 0; idx < thread_count; idx++) {
3673         while (!decomp_param[idx].done) {
3674             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3675         }
3676     }
3677     qemu_mutex_unlock(&decomp_done_lock);
3678     return qemu_file_get_error(decomp_file);
3679 }
3680 
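/*
 * Stop and join all decompression threads and free their per-thread
 * state.  Safe to call on a partially initialized set of threads:
 * compbuf is used as the "properly init'd" marker below.
 */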
3681 static void compress_threads_load_cleanup(void)
3682 {
3683     int i, thread_count;
3684 
3685     if (!migrate_use_compression()) {
3686         return;
3687     }
3688     thread_count = migrate_decompress_threads();
3689     for (i = 0; i < thread_count; i++) {
3690         /*
3691          * we use it as an indicator which shows whether the thread is
3692          * properly initialized or not
3693          */
3694         if (!decomp_param[i].compbuf) {
3695             break;
3696         }
3697 
3698         qemu_mutex_lock(&decomp_param[i].mutex);
3699         decomp_param[i].quit = true;
3700         qemu_cond_signal(&decomp_param[i].cond);
3701         qemu_mutex_unlock(&decomp_param[i].mutex);
3702     }
3703     for (i = 0; i < thread_count; i++) {
3704         if (!decomp_param[i].compbuf) {
3705             break;
3706         }
3707 
3708         qemu_thread_join(decompress_threads + i);
3709         qemu_mutex_destroy(&decomp_param[i].mutex);
3710         qemu_cond_destroy(&decomp_param[i].cond);
3711         inflateEnd(&decomp_param[i].stream);
3712         g_free(decomp_param[i].compbuf);
3713         decomp_param[i].compbuf = NULL;
3714     }
3715     g_free(decompress_threads);
3716     g_free(decomp_param);
3717     decompress_threads = NULL;
3718     decomp_param = NULL;
3719     decomp_file = NULL;
3720 }
3721 
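/*
 * Allocate and start the decompression threads for the incoming side.
 * Returns 0 on success and -1 on failure, after tearing down any
 * partially created state via compress_threads_load_cleanup().
 */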
3722 static int compress_threads_load_setup(QEMUFile *f)
3723 {
3724     int i, thread_count;
3725 
3726     if (!migrate_use_compression()) {
3727         return 0;
3728     }
3729 
3730     thread_count = migrate_decompress_threads();
3731     decompress_threads = g_new0(QemuThread, thread_count);
3732     decomp_param = g_new0(DecompressParam, thread_count);
3733     qemu_mutex_init(&decomp_done_lock);
3734     qemu_cond_init(&decomp_done_cond);
3735     decomp_file = f;
3736     for (i = 0; i < thread_count; i++) {
3737         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3738             goto exit;
3739         }
3740 
3741         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3742         qemu_mutex_init(&decomp_param[i].mutex);
3743         qemu_cond_init(&decomp_param[i].cond);
3744         decomp_param[i].done = true;
3745         decomp_param[i].quit = false;
3746         qemu_thread_create(decompress_threads + i, "decompress",
3747                            do_data_decompress, decomp_param + i,
3748                            QEMU_THREAD_JOINABLE);
3749     }
3750     return 0;
3751 exit:
3752     compress_threads_load_cleanup();
3753     return -1;
3754 }
3755 
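/*
 * Hand one compressed page to an idle decompression thread, copying the
 * @len compressed bytes from @f into that thread's buffer.  Blocks on
 * decomp_done_cond until a thread becomes available.
 */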
3756 static void decompress_data_with_multi_threads(QEMUFile *f,
3757                                                void *host, int len)
3758 {
3759     int idx, thread_count;
3760 
3761     thread_count = migrate_decompress_threads();
3762     QEMU_LOCK_GUARD(&decomp_done_lock);
3763     while (true) {
3764         for (idx = 0; idx < thread_count; idx++) {
3765             if (decomp_param[idx].done) {
3766                 decomp_param[idx].done = false;
3767                 qemu_mutex_lock(&decomp_param[idx].mutex);
3768                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3769                 decomp_param[idx].des = host;
3770                 decomp_param[idx].len = len;
3771                 qemu_cond_signal(&decomp_param[idx].cond);
3772                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3773                 break;
3774             }
3775         }
3776         if (idx < thread_count) {
3777             break;
3778         } else {
3779             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3780         }
3781     }
3782 }
3783 
3784 static void colo_init_ram_state(void)
3785 {
3786     ram_state_init(&ram_state);
3787 }
3788 
3789 /*
3790  * colo cache: this is for the secondary VM, we cache the whole
3791  * memory of the secondary VM.  The global lock must be held to
3792  * call this helper.
3793  */
3794 int colo_init_ram_cache(void)
3795 {
3796     RAMBlock *block;
3797 
3798     WITH_RCU_READ_LOCK_GUARD() {
3799         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3800             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3801                                                     NULL, false, false);
3802             if (!block->colo_cache) {
3803                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3804                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3805                              block->used_length);
3806                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3807                     if (block->colo_cache) {
3808                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3809                         block->colo_cache = NULL;
3810                     }
3811                 }
3812                 return -errno;
3813             }
3814             if (!machine_dump_guest_core(current_machine)) {
3815                 qemu_madvise(block->colo_cache, block->used_length,
3816                              QEMU_MADV_DONTDUMP);
3817             }
3818         }
3819     }
3820 
3821     /*
3822     * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3823     * to decide which pages in the cache should be flushed into the SVM's RAM.
3824     * Here we use the same name 'ram_bitmap' as for migration.
3825     */
3826     if (ram_bytes_total()) {
3827         RAMBlock *block;
3828 
3829         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3830             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3831             block->bmap = bitmap_new(pages);
3832         }
3833     }
3834 
3835     colo_init_ram_state();
3836     return 0;
3837 }
3838 
3839 /* TODO: duplicated with ram_init_bitmaps */
3840 void colo_incoming_start_dirty_log(void)
3841 {
3842     RAMBlock *block = NULL;
3843     /* For memory_global_dirty_log_start below. */
3844     qemu_mutex_lock_iothread();
3845     qemu_mutex_lock_ramlist();
3846 
3847     memory_global_dirty_log_sync();
3848     WITH_RCU_READ_LOCK_GUARD() {
3849         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3850             ramblock_sync_dirty_bitmap(ram_state, block);
3851             /* Discard this dirty bitmap record */
3852             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3853         }
3854         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3855     }
3856     ram_state->migration_dirty_pages = 0;
3857     qemu_mutex_unlock_ramlist();
3858     qemu_mutex_unlock_iothread();
3859 }
3860 
3861 /* The global lock must be held to call this helper */
3862 void colo_release_ram_cache(void)
3863 {
3864     RAMBlock *block;
3865 
3866     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3867     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3868         g_free(block->bmap);
3869         block->bmap = NULL;
3870     }
3871 
3872     WITH_RCU_READ_LOCK_GUARD() {
3873         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3874             if (block->colo_cache) {
3875                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3876                 block->colo_cache = NULL;
3877             }
3878         }
3879     }
3880     ram_state_cleanup(&ram_state);
3881 }
3882 
3883 /**
3884  * ram_load_setup: Setup RAM for migration incoming side
3885  *
3886  * Returns zero to indicate success and negative for error
3887  *
3888  * @f: QEMUFile where to receive the data
3889  * @opaque: RAMState pointer
3890  */
3891 static int ram_load_setup(QEMUFile *f, void *opaque)
3892 {
3893     if (compress_threads_load_setup(f)) {
3894         return -1;
3895     }
3896 
3897     xbzrle_load_setup();
3898     ramblock_recv_map_init();
3899 
3900     return 0;
3901 }
3902 
3903 static int ram_load_cleanup(void *opaque)
3904 {
3905     RAMBlock *rb;
3906 
3907     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3908         qemu_ram_block_writeback(rb);
3909     }
3910 
3911     xbzrle_load_cleanup();
3912     compress_threads_load_cleanup();
3913 
3914     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3915         g_free(rb->receivedmap);
3916         rb->receivedmap = NULL;
3917     }
3918 
3919     return 0;
3920 }
3921 
3922 /**
3923  * ram_postcopy_incoming_init: allocate postcopy data structures
3924  *
3925  * Returns 0 for success and negative if there was one error
3926  *
3927  * @mis: current migration incoming state
3928  *
3929  * Allocate data structures etc needed by incoming migration with
3930  * postcopy-ram.  postcopy-ram's similarly named
3931  * postcopy_ram_incoming_init does the work.
3932  */
3933 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3934 {
3935     return postcopy_ram_incoming_init(mis);
3936 }
3937 
3938 /**
3939  * ram_load_postcopy: load a page in postcopy case
3940  *
3941  * Returns 0 for success or -errno in case of error
3942  *
3943  * Called in postcopy mode by ram_load().
3944  * rcu_read_lock is taken prior to this being called.
3945  *
3946  * @f: QEMUFile to read the data from
3947  * @channel: the channel to use for loading
3948  */
3949 int ram_load_postcopy(QEMUFile *f, int channel)
3950 {
3951     int flags = 0, ret = 0;
3952     bool place_needed = false;
3953     bool matches_target_page_size = false;
3954     MigrationIncomingState *mis = migration_incoming_get_current();
3955     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3956 
3957     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3958         ram_addr_t addr;
3959         void *page_buffer = NULL;
3960         void *place_source = NULL;
3961         RAMBlock *block = NULL;
3962         uint8_t ch;
3963         int len;
3964 
3965         addr = qemu_get_be64(f);
3966 
3967         /*
3968          * If there is a QEMU file error, we should stop here; "addr"
3969          * may be invalid in that case.
3970          */
3971         ret = qemu_file_get_error(f);
3972         if (ret) {
3973             break;
3974         }
3975 
3976         flags = addr & ~TARGET_PAGE_MASK;
3977         addr &= TARGET_PAGE_MASK;
3978 
3979         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3980         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3981                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3982             block = ram_block_from_stream(mis, f, flags, channel);
3983             if (!block) {
3984                 ret = -EINVAL;
3985                 break;
3986             }
3987 
3988             /*
3989              * Relying on used_length is racy and can result in false positives.
3990              * We might place pages beyond used_length in case RAM was shrunk
3991              * while in postcopy, which is fine - trying to place via
3992              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3993              */
3994             if (!block->host || addr >= block->postcopy_length) {
3995                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3996                 ret = -EINVAL;
3997                 break;
3998             }
3999             tmp_page->target_pages++;
4000             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4001             /*
4002              * Postcopy requires that we place whole host pages atomically;
4003              * these may be huge pages for RAMBlocks that are backed by
4004              * hugetlbfs.
4005              * To make it atomic, the data is read into a temporary page
4006              * that's moved into place later.
4007              * The migration protocol uses possibly smaller target pages;
4008              * however, the source ensures it always sends all the components
4009              * of a host page in one chunk.
4010              */
4011             page_buffer = tmp_page->tmp_huge_page +
4012                           host_page_offset_from_ram_block_offset(block, addr);
4013             /* On the 1st target page of a host page, record the host page address */
4014             if (tmp_page->target_pages == 1) {
4015                 tmp_page->host_addr =
4016                     host_page_from_ram_block_offset(block, addr);
4017             } else if (tmp_page->host_addr !=
4018                        host_page_from_ram_block_offset(block, addr)) {
4019                 /* not the 1st TP within the HP */
4020                 error_report("Non-same host page detected on channel %d: "
4021                              "Target host page %p, received host page %p "
4022                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4023                              channel, tmp_page->host_addr,
4024                              host_page_from_ram_block_offset(block, addr),
4025                              block->idstr, addr, tmp_page->target_pages);
4026                 ret = -EINVAL;
4027                 break;
4028             }
4029 
4030             /*
4031              * If it's the last part of a host page then we place the host
4032              * page
4033              */
4034             if (tmp_page->target_pages ==
4035                 (block->page_size / TARGET_PAGE_SIZE)) {
4036                 place_needed = true;
4037             }
4038             place_source = tmp_page->tmp_huge_page;
4039         }
4040 
4041         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4042         case RAM_SAVE_FLAG_ZERO:
4043             ch = qemu_get_byte(f);
4044             /*
4045              * We can skip setting page_buffer when this is a zero page
4046              * and (block->page_size == TARGET_PAGE_SIZE).
4047              */
4048             if (ch || !matches_target_page_size) {
4049                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4050             }
4051             if (ch) {
4052                 tmp_page->all_zero = false;
4053             }
4054             break;
4055 
4056         case RAM_SAVE_FLAG_PAGE:
4057             tmp_page->all_zero = false;
4058             if (!matches_target_page_size) {
4059                 /* For huge pages, we always use a temporary buffer */
4060                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4061             } else {
4062                 /*
4063                  * For small pages that match the target page size, we
4064                  * avoid the qemu_file copy.  Instead we directly use
4065                  * the buffer of QEMUFile to place the page.  Note: we
4066                  * cannot do any QEMUFile operation before using that
4067                  * buffer to make sure the buffer is valid when
4068                  * placing the page.
4069                  */
4070                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4071                                          TARGET_PAGE_SIZE);
4072             }
4073             break;
4074         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4075             tmp_page->all_zero = false;
4076             len = qemu_get_be32(f);
4077             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4078                 error_report("Invalid compressed data length: %d", len);
4079                 ret = -EINVAL;
4080                 break;
4081             }
4082             decompress_data_with_multi_threads(f, page_buffer, len);
4083             break;
4084 
4085         case RAM_SAVE_FLAG_EOS:
4086             /* normal exit */
4087             multifd_recv_sync_main();
4088             break;
4089         default:
4090             error_report("Unknown combination of migration flags: 0x%x"
4091                          " (postcopy mode)", flags);
4092             ret = -EINVAL;
4093             break;
4094         }
4095 
4096         /* Got the whole host page, wait for decompress before placing. */
4097         if (place_needed) {
4098             ret |= wait_for_decompress_done();
4099         }
4100 
4101         /* Check for any possible file errors */
4102         if (!ret && qemu_file_get_error(f)) {
4103             ret = qemu_file_get_error(f);
4104         }
4105 
4106         if (!ret && place_needed) {
4107             if (tmp_page->all_zero) {
4108                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4109             } else {
4110                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4111                                           place_source, block);
4112             }
4113             place_needed = false;
4114             postcopy_temp_page_reset(tmp_page);
4115         }
4116     }
4117 
4118     return ret;
4119 }
4120 
4121 static bool postcopy_is_advised(void)
4122 {
4123     PostcopyState ps = postcopy_state_get();
4124     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4125 }
4126 
4127 static bool postcopy_is_running(void)
4128 {
4129     PostcopyState ps = postcopy_state_get();
4130     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4131 }
4132 
4133 /*
4134  * Flush content of RAM cache into SVM's memory.
4135  * Only flush the pages that have been dirtied by the PVM or SVM or both.
4136  */
4137 void colo_flush_ram_cache(void)
4138 {
4139     RAMBlock *block = NULL;
4140     void *dst_host;
4141     void *src_host;
4142     unsigned long offset = 0;
4143 
4144     memory_global_dirty_log_sync();
4145     WITH_RCU_READ_LOCK_GUARD() {
4146         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4147             ramblock_sync_dirty_bitmap(ram_state, block);
4148         }
4149     }
4150 
4151     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4152     WITH_RCU_READ_LOCK_GUARD() {
4153         block = QLIST_FIRST_RCU(&ram_list.blocks);
4154 
4155         while (block) {
4156             unsigned long num = 0;
4157 
4158             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4159             if (!offset_in_ramblock(block,
4160                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4161                 offset = 0;
4162                 num = 0;
4163                 block = QLIST_NEXT_RCU(block, next);
4164             } else {
4165                 unsigned long i = 0;
4166 
4167                 for (i = 0; i < num; i++) {
4168                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4169                 }
4170                 dst_host = block->host
4171                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4172                 src_host = block->colo_cache
4173                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4174                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4175                 offset += num;
4176             }
4177         }
4178     }
4179     trace_colo_flush_ram_cache_end();
4180 }
4181 
4182 /**
4183  * ram_load_precopy: load pages in precopy case
4184  *
4185  * Returns 0 for success or -errno in case of error
4186  *
4187  * Called in precopy mode by ram_load().
4188  * rcu_read_lock is taken prior to this being called.
4189  *
4190  * @f: QEMUFile to read the data from
4191  */
4192 static int ram_load_precopy(QEMUFile *f)
4193 {
4194     MigrationIncomingState *mis = migration_incoming_get_current();
4195     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4196     /* ADVISE is earlier; it shows that the source has the postcopy capability on */
4197     bool postcopy_advised = postcopy_is_advised();
4198     if (!migrate_use_compression()) {
4199         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4200     }
4201 
4202     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4203         ram_addr_t addr, total_ram_bytes;
4204         void *host = NULL, *host_bak = NULL;
4205         uint8_t ch;
4206 
4207         /*
4208          * Yield periodically to let the main loop run, but an iteration of
4209          * the main loop is expensive, so only do it every few iterations.
4210          */
4211         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4212             aio_co_schedule(qemu_get_current_aio_context(),
4213                             qemu_coroutine_self());
4214             qemu_coroutine_yield();
4215         }
4216         i++;
4217 
4218         addr = qemu_get_be64(f);
4219         flags = addr & ~TARGET_PAGE_MASK;
4220         addr &= TARGET_PAGE_MASK;
4221 
4222         if (flags & invalid_flags) {
4223             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4224                 error_report("Received an unexpected compressed page");
4225             }
4226 
4227             ret = -EINVAL;
4228             break;
4229         }
4230 
4231         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4232                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4233             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4234                                                     RAM_CHANNEL_PRECOPY);
4235 
4236             host = host_from_ram_block_offset(block, addr);
4237             /*
4238              * After entering the COLO stage, we should not load pages into
4239              * the SVM's memory directly; we put them into colo_cache first.
4240              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4241              * Previously, we copied all this memory in the COLO preparation
4242              * stage, during which the VM had to be stopped, which is
4243              * time-consuming.  Here we optimize it by backing up every page
4244              * during migration while COLO is enabled.  This slows the
4245              * migration down a bit, but it clearly reduces the downtime of
4246              * backing up all of the SVM's memory in the COLO preparation stage.
4247              */
4248             if (migration_incoming_colo_enabled()) {
4249                 if (migration_incoming_in_colo_state()) {
4250                     /* In COLO stage, put all pages into cache temporarily */
4251                     host = colo_cache_from_block_offset(block, addr, true);
4252                 } else {
4253                    /*
4254                     * In the migration stage but before the COLO stage,
4255                     * put all pages into both the cache and the SVM's memory.
4256                     */
4257                     host_bak = colo_cache_from_block_offset(block, addr, false);
4258                 }
4259             }
4260             if (!host) {
4261                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4262                 ret = -EINVAL;
4263                 break;
4264             }
4265             if (!migration_incoming_in_colo_state()) {
4266                 ramblock_recv_bitmap_set(block, host);
4267             }
4268 
4269             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4270         }
4271 
4272         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4273         case RAM_SAVE_FLAG_MEM_SIZE:
4274             /* Synchronize RAM block list */
4275             total_ram_bytes = addr;
4276             while (!ret && total_ram_bytes) {
4277                 RAMBlock *block;
4278                 char id[256];
4279                 ram_addr_t length;
4280 
4281                 len = qemu_get_byte(f);
4282                 qemu_get_buffer(f, (uint8_t *)id, len);
4283                 id[len] = 0;
4284                 length = qemu_get_be64(f);
4285 
4286                 block = qemu_ram_block_by_name(id);
4287                 if (block && !qemu_ram_is_migratable(block)) {
4288                     error_report("block %s should not be migrated !", id);
4289                     ret = -EINVAL;
4290                 } else if (block) {
4291                     if (length != block->used_length) {
4292                         Error *local_err = NULL;
4293 
4294                         ret = qemu_ram_resize(block, length,
4295                                               &local_err);
4296                         if (local_err) {
4297                             error_report_err(local_err);
4298                         }
4299                     }
4300                     /* For postcopy we need to check hugepage sizes match */
4301                     if (postcopy_advised && migrate_postcopy_ram() &&
4302                         block->page_size != qemu_host_page_size) {
4303                         uint64_t remote_page_size = qemu_get_be64(f);
4304                         if (remote_page_size != block->page_size) {
4305                             error_report("Mismatched RAM page size %s "
4306                                          "(local) %zd != %" PRId64,
4307                                          id, block->page_size,
4308                                          remote_page_size);
4309                             ret = -EINVAL;
4310                         }
4311                     }
4312                     if (migrate_ignore_shared()) {
4313                         hwaddr addr = qemu_get_be64(f);
4314                         if (ramblock_is_ignored(block) &&
4315                             block->mr->addr != addr) {
4316                             error_report("Mismatched GPAs for block %s "
4317                                          "%" PRId64 "!= %" PRId64,
4318                                          id, (uint64_t)addr,
4319                                          (uint64_t)block->mr->addr);
4320                             ret = -EINVAL;
4321                         }
4322                     }
4323                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4324                                           block->idstr);
4325                 } else {
4326                     error_report("Unknown ramblock \"%s\", cannot "
4327                                  "accept migration", id);
4328                     ret = -EINVAL;
4329                 }
4330 
4331                 total_ram_bytes -= length;
4332             }
4333             break;
4334 
4335         case RAM_SAVE_FLAG_ZERO:
4336             ch = qemu_get_byte(f);
4337             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4338             break;
4339 
4340         case RAM_SAVE_FLAG_PAGE:
4341             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4342             break;
4343 
4344         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4345             len = qemu_get_be32(f);
4346             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4347                 error_report("Invalid compressed data length: %d", len);
4348                 ret = -EINVAL;
4349                 break;
4350             }
4351             decompress_data_with_multi_threads(f, host, len);
4352             break;
4353 
4354         case RAM_SAVE_FLAG_XBZRLE:
4355             if (load_xbzrle(f, addr, host) < 0) {
4356                 error_report("Failed to decompress XBZRLE page at "
4357                              RAM_ADDR_FMT, addr);
4358                 ret = -EINVAL;
4359                 break;
4360             }
4361             break;
4362         case RAM_SAVE_FLAG_EOS:
4363             /* normal exit */
4364             multifd_recv_sync_main();
4365             break;
4366         default:
4367             if (flags & RAM_SAVE_FLAG_HOOK) {
4368                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4369             } else {
4370                 error_report("Unknown combination of migration flags: 0x%x",
4371                              flags);
4372                 ret = -EINVAL;
4373             }
4374         }
4375         if (!ret) {
4376             ret = qemu_file_get_error(f);
4377         }
4378         if (!ret && host_bak) {
4379             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4380         }
4381     }
4382 
4383     ret |= wait_for_decompress_done();
4384     return ret;
4385 }
4386 
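/**
 * ram_load: load RAM pages from an incoming migration stream
 *
 * Returns 0 for success or a negative errno in case of error
 *
 * Dispatches to ram_load_postcopy() once the destination runs in postcopy
 * mode (where page placement must be atomic), and to ram_load_precopy()
 * otherwise.
 *
 * @f: QEMUFile to read the data from
 * @opaque: RAMState pointer
 * @version_id: stream version; only version 4 is accepted
 */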
4387 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4388 {
4389     int ret = 0;
4390     static uint64_t seq_iter;
4391     /*
4392      * If the system is running in postcopy mode, page inserts into host
4393      * memory must be atomic.
4394      */
4395     bool postcopy_running = postcopy_is_running();
4396 
4397     seq_iter++;
4398 
4399     if (version_id != 4) {
4400         return -EINVAL;
4401     }
4402 
4403     /*
4404      * This RCU critical section can be very long running.
4405      * When RCU reclaims in the code start to become numerous,
4406      * it will be necessary to reduce the granularity of this
4407      * critical section.
4408      */
4409     WITH_RCU_READ_LOCK_GUARD() {
4410         if (postcopy_running) {
4411             /*
4412              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4413              * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4414              * service fast page faults.
4415              */
4416             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4417         } else {
4418             ret = ram_load_precopy(f);
4419         }
4420     }
4421     trace_ram_load_complete(ret, seq_iter);
4422 
4423     return ret;
4424 }
4425 
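/*
 * Report whether postcopy can be used for RAM: it is refused if any
 * migratable RAM block is backed by persistent memory, otherwise it
 * follows the postcopy-ram capability.
 */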
4426 static bool ram_has_postcopy(void *opaque)
4427 {
4428     RAMBlock *rb;
4429     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4430         if (ramblock_is_pmem(rb)) {
4431             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4432                         "is not supported now!", rb->idstr, rb->host);
4433             return false;
4434         }
4435     }
4436 
4437     return migrate_postcopy_ram();
4438 }
4439 
4440 /* Sync all the dirty bitmaps with the destination VM.  */
4441 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4442 {
4443     RAMBlock *block;
4444     QEMUFile *file = s->to_dst_file;
4445     int ramblock_count = 0;
4446 
4447     trace_ram_dirty_bitmap_sync_start();
4448 
4449     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4450         qemu_savevm_send_recv_bitmap(file, block->idstr);
4451         trace_ram_dirty_bitmap_request(block->idstr);
4452         ramblock_count++;
4453     }
4454 
4455     trace_ram_dirty_bitmap_sync_wait();
4456 
4457     /* Wait until all the ramblocks' dirty bitmaps are synced */
4458     while (ramblock_count--) {
4459         qemu_sem_wait(&s->rp_state.rp_sem);
4460     }
4461 
4462     trace_ram_dirty_bitmap_sync_complete();
4463 
4464     return 0;
4465 }
4466 
4467 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4468 {
4469     qemu_sem_post(&s->rp_state.rp_sem);
4470 }
4471 
4472 /*
4473  * Read the received bitmap and invert it to form the initial dirty bitmap.
4474  * This is only used when a postcopy migration is paused but wants
4475  * to resume from a middle point.
4476  */
4477 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4478 {
4479     int ret = -EINVAL;
4480     /* from_dst_file is always valid because we're within rp_thread */
4481     QEMUFile *file = s->rp_state.from_dst_file;
4482     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4483     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4484     uint64_t size, end_mark;
4485 
4486     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4487 
4488     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4489         error_report("%s: incorrect state %s", __func__,
4490                      MigrationStatus_str(s->state));
4491         return -EINVAL;
4492     }
4493 
4494     /*
4495      * Note: see comments in ramblock_recv_bitmap_send() on why we
4496      * need the endianness conversion, and the padding.
4497      */
4498     local_size = ROUND_UP(local_size, 8);
4499 
4500     /* Add padding */
4501     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4502 
4503     size = qemu_get_be64(file);
4504 
4505     /* The size of the bitmap should match that of our ramblock */
4506     if (size != local_size) {
4507         error_report("%s: ramblock '%s' bitmap size mismatch "
4508                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4509                      block->idstr, size, local_size);
4510         ret = -EINVAL;
4511         goto out;
4512     }
4513 
4514     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4515     end_mark = qemu_get_be64(file);
4516 
4517     ret = qemu_file_get_error(file);
4518     if (ret || size != local_size) {
4519         error_report("%s: read bitmap failed for ramblock '%s': %d"
4520                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4521                      __func__, block->idstr, ret, local_size, size);
4522         ret = -EIO;
4523         goto out;
4524     }
4525 
4526     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4527         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4528                      __func__, block->idstr, end_mark);
4529         ret = -EINVAL;
4530         goto out;
4531     }
4532 
4533     /*
4534      * Endianness conversion.  We are in postcopy (though paused).
4535      * The dirty bitmap won't change. We can directly modify it.
4536      */
4537     bitmap_from_le(block->bmap, le_bitmap, nbits);
4538 
4539     /*
4540      * What we received is the "received bitmap".  Invert it to form the
4541      * initial dirty bitmap for this ramblock.
4542      */
4543     bitmap_complement(block->bmap, block->bmap, nbits);
4544 
4545     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4546     ramblock_dirty_bitmap_clear_discarded_pages(block);
4547 
4548     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4549     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4550 
4551     /*
4552      * We succeeded in syncing the bitmap for the current ramblock. If this is
4553      * the last one to sync, we need to notify the main send thread.
4554      */
4555     ram_dirty_bitmap_reload_notify(s);
4556 
4557     ret = 0;
4558 out:
4559     g_free(le_bitmap);
4560     return ret;
4561 }
4562 
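/*
 * Prepare RAM for resuming a paused postcopy migration: fetch the
 * received bitmaps back from the destination and rebuild the dirty
 * state from them before the source starts sending again.
 */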
4563 static int ram_resume_prepare(MigrationState *s, void *opaque)
4564 {
4565     RAMState *rs = *(RAMState **)opaque;
4566     int ret;
4567 
4568     ret = ram_dirty_bitmap_sync_all(s, rs);
4569     if (ret) {
4570         return ret;
4571     }
4572 
4573     ram_state_resume_prepare(rs, s->to_dst_file);
4574 
4575     return 0;
4576 }
4577 
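/*
 * Cleanly terminate the postcopy preempt channel by sending a final EOS
 * marker and flushing the stream.
 */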
4578 void postcopy_preempt_shutdown_file(MigrationState *s)
4579 {
4580     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4581     qemu_fflush(s->postcopy_qemufile_src);
4582 }
4583 
4584 static SaveVMHandlers savevm_ram_handlers = {
4585     .save_setup = ram_save_setup,
4586     .save_live_iterate = ram_save_iterate,
4587     .save_live_complete_postcopy = ram_save_complete,
4588     .save_live_complete_precopy = ram_save_complete,
4589     .has_postcopy = ram_has_postcopy,
4590     .save_live_pending = ram_save_pending,
4591     .load_state = ram_load,
4592     .save_cleanup = ram_save_cleanup,
4593     .load_setup = ram_load_setup,
4594     .load_cleanup = ram_load_cleanup,
4595     .resume_prepare = ram_resume_prepare,
4596 };
4597 
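/*
 * RAMBlock resize notifier: cancel an in-progress precopy on the source
 * (sizes sent in the stream must not change afterwards) and, on the
 * destination, keep postcopy_length up to date while postcopy is still
 * in the ADVISE stage.
 */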
4598 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4599                                       size_t old_size, size_t new_size)
4600 {
4601     PostcopyState ps = postcopy_state_get();
4602     ram_addr_t offset;
4603     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4604     Error *err = NULL;
4605 
4606     if (ramblock_is_ignored(rb)) {
4607         return;
4608     }
4609 
4610     if (!migration_is_idle()) {
4611         /*
4612          * Precopy code on the source cannot deal with the size of RAM blocks
4613          * changing at random points in time - especially after sending the
4614          * RAM block sizes in the migration stream, they must no longer change.
4615          * Abort and indicate a proper reason.
4616          */
4617         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4618         migration_cancel(err);
4619         error_free(err);
4620     }
4621 
4622     switch (ps) {
4623     case POSTCOPY_INCOMING_ADVISE:
4624         /*
4625          * Update what ram_postcopy_incoming_init()->init_range() does at the
4626          * time postcopy was advised. Syncing RAM blocks with the source will
4627          * result in RAM resizes.
4628          */
4629         if (old_size < new_size) {
4630             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4631                 error_report("RAM block '%s' discard of resized RAM failed",
4632                              rb->idstr);
4633             }
4634         }
4635         rb->postcopy_length = new_size;
4636         break;
4637     case POSTCOPY_INCOMING_NONE:
4638     case POSTCOPY_INCOMING_RUNNING:
4639     case POSTCOPY_INCOMING_END:
4640         /*
4641          * Once our guest is running, postcopy no longer cares about
4642          * resizes. When growing, the new memory was not available on the
4643          * source, so no handler is needed.
4644          */
4645         break;
4646     default:
4647         error_report("RAM block '%s' resized during postcopy state: %d",
4648                      rb->idstr, ps);
4649         exit(-1);
4650     }
4651 }
4652 
4653 static RAMBlockNotifier ram_mig_ram_notifier = {
4654     .ram_block_resized = ram_mig_ram_block_resized,
4655 };
4656 
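/*
 * Register the RAM savevm handlers and the RAM block resize notifier;
 * called once during QEMU startup, before any migration can run.
 */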
4657 void ram_mig_init(void)
4658 {
4659     qemu_mutex_init(&XBZRLE.lock);
4660     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4661     ram_block_notifier_add(&ram_mig_ram_notifier);
4662 }
4663