xref: /openbmc/qemu/migration/ram.c (revision 06e2b010)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62 
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66 
67 /***********************************************************/
68 /* ram save/restore */
69 
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71  * worked for pages that were filled with the same char.  We switched
72  * it to only search for the zero value, and to avoid confusion with
73  * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
74  */
75 
76 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO     0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE     0x08
80 #define RAM_SAVE_FLAG_EOS      0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE   0x40
83 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
85 
86 XBZRLECacheStats xbzrle_counters;
87 
88 /* used by the search for pages to send */
89 struct PageSearchStatus {
90     /* The migration channel used for a specific host page */
91     QEMUFile    *pss_channel;
92     /* Last block from where we have sent data */
93     RAMBlock *last_sent_block;
94     /* Current block being searched */
95     RAMBlock    *block;
96     /* Current page to search from */
97     unsigned long page;
98     /* Set once we wrap around */
99     bool         complete_round;
100     /* Whether we're sending a host page */
101     bool          host_page_sending;
102     /* The start/end of the current host page.  Invalid if host_page_sending==false */
103     unsigned long host_page_start;
104     unsigned long host_page_end;
105 };
106 typedef struct PageSearchStatus PageSearchStatus;
107 
108 /* This struct contains the XBZRLE cache and a static page
109    used for compression */
110 static struct {
111     /* buffer used for XBZRLE encoding */
112     uint8_t *encoded_buf;
113     /* buffer for storing page content */
114     uint8_t *current_buf;
115     /* Cache for XBZRLE, protected by lock. */
116     PageCache *cache;
117     QemuMutex lock;
118     /* it will store a page full of zeros */
119     uint8_t *zero_target_page;
120     /* buffer used for XBZRLE decoding */
121     uint8_t *decoded_buf;
122 } XBZRLE;
123 
124 static void XBZRLE_cache_lock(void)
125 {
126     if (migrate_use_xbzrle()) {
127         qemu_mutex_lock(&XBZRLE.lock);
128     }
129 }
130 
131 static void XBZRLE_cache_unlock(void)
132 {
133     if (migrate_use_xbzrle()) {
134         qemu_mutex_unlock(&XBZRLE.lock);
135     }
136 }
137 
138 /**
139  * xbzrle_cache_resize: resize the xbzrle cache
140  *
141  * This function is called from migrate_params_apply in the main
142  * thread, possibly while a migration is in progress.  A running
143  * migration may be using the cache and might finish during this call,
144  * hence changes to the cache are protected by the XBZRLE cache lock.
145  *
146  * Returns 0 for success or -1 for error
147  *
148  * @new_size: new cache size
149  * @errp: set *errp if the check failed, with reason
150  */
151 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
152 {
153     PageCache *new_cache;
154     int64_t ret = 0;
155 
156     /* Check for truncation */
157     if (new_size != (size_t)new_size) {
158         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
159                    "exceeding address space");
160         return -1;
161     }
162 
163     if (new_size == migrate_xbzrle_cache_size()) {
164         /* nothing to do */
165         return 0;
166     }
167 
168     XBZRLE_cache_lock();
169 
170     if (XBZRLE.cache != NULL) {
171         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
172         if (!new_cache) {
173             ret = -1;
174             goto out;
175         }
176 
177         cache_fini(XBZRLE.cache);
178         XBZRLE.cache = new_cache;
179     }
180 out:
181     XBZRLE_cache_unlock();
182     return ret;
183 }
184 
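/*
 * Illustrative usage sketch for xbzrle_cache_resize(), with a hypothetical
 * 256 MiB size; in practice the call is driven by migrate_params_apply()
 * as noted in the comment above:
 *
 *     Error *err = NULL;
 *
 *     if (xbzrle_cache_resize(256 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 */
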
185 static bool postcopy_preempt_active(void)
186 {
187     return migrate_postcopy_preempt() && migration_in_postcopy();
188 }
189 
190 bool ramblock_is_ignored(RAMBlock *block)
191 {
192     return !qemu_ram_is_migratable(block) ||
193            (migrate_ignore_shared() && qemu_ram_is_shared(block));
194 }
195 
196 #undef RAMBLOCK_FOREACH
197 
198 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
199 {
200     RAMBlock *block;
201     int ret = 0;
202 
203     RCU_READ_LOCK_GUARD();
204 
205     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
206         ret = func(block, opaque);
207         if (ret) {
208             break;
209         }
210     }
211     return ret;
212 }
213 
214 static void ramblock_recv_map_init(void)
215 {
216     RAMBlock *rb;
217 
218     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
219         assert(!rb->receivedmap);
220         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
221     }
222 }
223 
224 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
225 {
226     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
227                     rb->receivedmap);
228 }
229 
230 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
231 {
232     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
233 }
234 
235 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
236 {
237     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
238 }
239 
240 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
241                                     size_t nr)
242 {
243     bitmap_set_atomic(rb->receivedmap,
244                       ramblock_recv_bitmap_offset(host_addr, rb),
245                       nr);
246 }
247 
248 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
249 
250 /*
251  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
252  *
253  * Returns the number of bytes sent (>0) on success, or <0 on error.
254  */
255 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
256                                   const char *block_name)
257 {
258     RAMBlock *block = qemu_ram_block_by_name(block_name);
259     unsigned long *le_bitmap, nbits;
260     uint64_t size;
261 
262     if (!block) {
263         error_report("%s: invalid block name: %s", __func__, block_name);
264         return -1;
265     }
266 
267     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
268 
269     /*
270      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
271      * machines we may need 4 more bytes for padding (see below
272      * comment). So extend it a bit beforehand.
273      */
274     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
275 
276     /*
277      * Always use little endian when sending the bitmap. This is
278      * required so that it is read correctly even when the source and
279      * destination VMs are not using the same endianness. (Note: big endian won't work.)
280      */
281     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
282 
283     /* Size of the bitmap, in bytes */
284     size = DIV_ROUND_UP(nbits, 8);
285 
286     /*
287      * size is always aligned to 8 bytes for 64bit machines, but that
288      * may not be true for 32bit machines. We need this padding to
289      * make sure the migration can survive even between 32bit and
290      * 64bit machines.
291      */
292     size = ROUND_UP(size, 8);
293 
294     qemu_put_be64(file, size);
295     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
296     /*
297      * Mark the end, in case the middle part got corrupted for some
298      * "mysterious" reason.
299      */
300     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
301     qemu_fflush(file);
302 
303     g_free(le_bitmap);
304 
305     if (qemu_file_get_error(file)) {
306         return qemu_file_get_error(file);
307     }
308 
309     return size + sizeof(size);
310 }
311 
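/*
 * Illustrative wire layout of what ramblock_recv_bitmap_send() emits, for a
 * hypothetical block of 100 target pages (nbits = 100):
 *
 *   be64: size               = ROUND_UP(DIV_ROUND_UP(100, 8), 8) = 16
 *   16 bytes                 : little-endian copy of receivedmap
 *                              (13 bytes of bitmap + 3 bytes of padding)
 *   be64: 0x0123456789abcdef = RAMBLOCK_RECV_BITMAP_ENDING
 *
 * On success the function returns size + sizeof(size) = 24.
 */
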
312 /*
313  * An outstanding page request, on the source, having been received
314  * and queued
315  */
316 struct RAMSrcPageRequest {
317     RAMBlock *rb;
318     hwaddr    offset;
319     hwaddr    len;
320 
321     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
322 };
323 
324 /* State of RAM for migration */
325 struct RAMState {
326     /*
327      * PageSearchStatus structures for the channels when sending pages.
328      * Protected by the bitmap_mutex.
329      */
330     PageSearchStatus pss[RAM_CHANNEL_MAX];
331     /* UFFD file descriptor, used in 'write-tracking' migration */
332     int uffdio_fd;
333     /* Last block that we have visited searching for dirty pages */
334     RAMBlock *last_seen_block;
335     /* Last dirty target page we have sent */
336     ram_addr_t last_page;
337     /* last ram version we have seen */
338     uint32_t last_version;
339     /* How many times we have dirtied too many pages */
340     int dirty_rate_high_cnt;
341     /* these variables are used for bitmap sync */
342     /* last time we did a full bitmap_sync */
343     int64_t time_last_bitmap_sync;
344     /* bytes transferred at start_time */
345     uint64_t bytes_xfer_prev;
346     /* number of dirty pages since start_time */
347     uint64_t num_dirty_pages_period;
348     /* xbzrle misses since the beginning of the period */
349     uint64_t xbzrle_cache_miss_prev;
350     /* Number of xbzrle pages since the beginning of the period */
351     uint64_t xbzrle_pages_prev;
352     /* Number of xbzrle encoded bytes since the beginning of the period */
353     uint64_t xbzrle_bytes_prev;
354     /* Start using XBZRLE (e.g., after the first round). */
355     bool xbzrle_enabled;
356     /* Are we on the last stage of migration */
357     bool last_stage;
358     /* compression statistics since the beginning of the period */
359     /* number of times there was no free thread to compress data */
360     uint64_t compress_thread_busy_prev;
361     /* number of bytes after compression */
362     uint64_t compressed_size_prev;
363     /* number of compressed pages */
364     uint64_t compress_pages_prev;
365 
366     /* total handled target pages at the beginning of period */
367     uint64_t target_page_count_prev;
368     /* total handled target pages since start */
369     uint64_t target_page_count;
370     /* number of dirty bits in the bitmap */
371     uint64_t migration_dirty_pages;
372     /*
373      * Protects:
374      * - dirty/clear bitmap
375      * - migration_dirty_pages
376      * - pss structures
377      */
378     QemuMutex bitmap_mutex;
379     /* The RAMBlock used in the last src_page_requests */
380     RAMBlock *last_req_rb;
381     /* Queue of outstanding page requests from the destination */
382     QemuMutex src_page_req_mutex;
383     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
384 };
385 typedef struct RAMState RAMState;
386 
387 static RAMState *ram_state;
388 
389 static NotifierWithReturnList precopy_notifier_list;
390 
391 /* Whether postcopy has queued requests */
392 static bool postcopy_has_request(RAMState *rs)
393 {
394     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
395 }
396 
397 void precopy_infrastructure_init(void)
398 {
399     notifier_with_return_list_init(&precopy_notifier_list);
400 }
401 
402 void precopy_add_notifier(NotifierWithReturn *n)
403 {
404     notifier_with_return_list_add(&precopy_notifier_list, n);
405 }
406 
407 void precopy_remove_notifier(NotifierWithReturn *n)
408 {
409     notifier_with_return_remove(n);
410 }
411 
412 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
413 {
414     PrecopyNotifyData pnd;
415     pnd.reason = reason;
416     pnd.errp = errp;
417 
418     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
419 }
420 
421 uint64_t ram_bytes_remaining(void)
422 {
423     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
424                        0;
425 }
426 
427 /*
428  * NOTE: not all stats in ram_counters are used in reality.  See comments
429  * for struct MigrationAtomicStats.  The ultimate result of ram migration
430  * counters will be a merged version with both ram_counters and the atomic
431  * fields in ram_atomic_counters.
432  */
433 MigrationStats ram_counters;
434 MigrationAtomicStats ram_atomic_counters;
435 
436 void ram_transferred_add(uint64_t bytes)
437 {
438     if (runstate_is_running()) {
439         ram_counters.precopy_bytes += bytes;
440     } else if (migration_in_postcopy()) {
441         stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
442     } else {
443         ram_counters.downtime_bytes += bytes;
444     }
445     stat64_add(&ram_atomic_counters.transferred, bytes);
446 }
447 
448 void dirty_sync_missed_zero_copy(void)
449 {
450     ram_counters.dirty_sync_missed_zero_copy++;
451 }
452 
453 CompressionStats compression_counters;
454 
455 struct CompressParam {
456     bool done;
457     bool quit;
458     bool zero_page;
459     QEMUFile *file;
460     QemuMutex mutex;
461     QemuCond cond;
462     RAMBlock *block;
463     ram_addr_t offset;
464 
465     /* internally used fields */
466     z_stream stream;
467     uint8_t *originbuf;
468 };
469 typedef struct CompressParam CompressParam;
470 
471 struct DecompressParam {
472     bool done;
473     bool quit;
474     QemuMutex mutex;
475     QemuCond cond;
476     void *des;
477     uint8_t *compbuf;
478     int len;
479     z_stream stream;
480 };
481 typedef struct DecompressParam DecompressParam;
482 
483 static CompressParam *comp_param;
484 static QemuThread *compress_threads;
485 /* comp_done_cond is used to wake up the migration thread when
486  * one of the compression threads has finished the compression.
487  * comp_done_lock is used together with comp_done_cond.
488  */
489 static QemuMutex comp_done_lock;
490 static QemuCond comp_done_cond;
491 
492 static QEMUFile *decomp_file;
493 static DecompressParam *decomp_param;
494 static QemuThread *decompress_threads;
495 static QemuMutex decomp_done_lock;
496 static QemuCond decomp_done_cond;
497 
498 static int ram_save_host_page_urgent(PageSearchStatus *pss);
499 
500 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
501                                  ram_addr_t offset, uint8_t *source_buf);
502 
503 /* NOTE: page is the PFN, not a real ram_addr_t. */
504 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
505 {
506     pss->block = rb;
507     pss->page = page;
508     pss->complete_round = false;
509 }
510 
511 /*
512  * Check whether two PSSs are actively sending the same page.  Return true
513  * if it is, false otherwise.
514  */
515 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
516 {
517     return pss1->host_page_sending && pss2->host_page_sending &&
518         (pss1->host_page_start == pss2->host_page_start);
519 }
520 
521 static void *do_data_compress(void *opaque)
522 {
523     CompressParam *param = opaque;
524     RAMBlock *block;
525     ram_addr_t offset;
526     bool zero_page;
527 
528     qemu_mutex_lock(&param->mutex);
529     while (!param->quit) {
530         if (param->block) {
531             block = param->block;
532             offset = param->offset;
533             param->block = NULL;
534             qemu_mutex_unlock(&param->mutex);
535 
536             zero_page = do_compress_ram_page(param->file, &param->stream,
537                                              block, offset, param->originbuf);
538 
539             qemu_mutex_lock(&comp_done_lock);
540             param->done = true;
541             param->zero_page = zero_page;
542             qemu_cond_signal(&comp_done_cond);
543             qemu_mutex_unlock(&comp_done_lock);
544 
545             qemu_mutex_lock(&param->mutex);
546         } else {
547             qemu_cond_wait(&param->cond, &param->mutex);
548         }
549     }
550     qemu_mutex_unlock(&param->mutex);
551 
552     return NULL;
553 }
554 
555 static void compress_threads_save_cleanup(void)
556 {
557     int i, thread_count;
558 
559     if (!migrate_use_compression() || !comp_param) {
560         return;
561     }
562 
563     thread_count = migrate_compress_threads();
564     for (i = 0; i < thread_count; i++) {
565         /*
566          * we use it as an indicator of whether the thread has been
567          * properly initialized or not
568          */
569         if (!comp_param[i].file) {
570             break;
571         }
572 
573         qemu_mutex_lock(&comp_param[i].mutex);
574         comp_param[i].quit = true;
575         qemu_cond_signal(&comp_param[i].cond);
576         qemu_mutex_unlock(&comp_param[i].mutex);
577 
578         qemu_thread_join(compress_threads + i);
579         qemu_mutex_destroy(&comp_param[i].mutex);
580         qemu_cond_destroy(&comp_param[i].cond);
581         deflateEnd(&comp_param[i].stream);
582         g_free(comp_param[i].originbuf);
583         qemu_fclose(comp_param[i].file);
584         comp_param[i].file = NULL;
585     }
586     qemu_mutex_destroy(&comp_done_lock);
587     qemu_cond_destroy(&comp_done_cond);
588     g_free(compress_threads);
589     g_free(comp_param);
590     compress_threads = NULL;
591     comp_param = NULL;
592 }
593 
594 static int compress_threads_save_setup(void)
595 {
596     int i, thread_count;
597 
598     if (!migrate_use_compression()) {
599         return 0;
600     }
601     thread_count = migrate_compress_threads();
602     compress_threads = g_new0(QemuThread, thread_count);
603     comp_param = g_new0(CompressParam, thread_count);
604     qemu_cond_init(&comp_done_cond);
605     qemu_mutex_init(&comp_done_lock);
606     for (i = 0; i < thread_count; i++) {
607         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608         if (!comp_param[i].originbuf) {
609             goto exit;
610         }
611 
612         if (deflateInit(&comp_param[i].stream,
613                         migrate_compress_level()) != Z_OK) {
614             g_free(comp_param[i].originbuf);
615             goto exit;
616         }
617 
618         /* comp_param[i].file is just used as a dummy buffer to save data;
619          * back it with a null output channel.
620          */
621         comp_param[i].file = qemu_file_new_output(
622             QIO_CHANNEL(qio_channel_null_new()));
623         comp_param[i].done = true;
624         comp_param[i].quit = false;
625         qemu_mutex_init(&comp_param[i].mutex);
626         qemu_cond_init(&comp_param[i].cond);
627         qemu_thread_create(compress_threads + i, "compress",
628                            do_data_compress, comp_param + i,
629                            QEMU_THREAD_JOINABLE);
630     }
631     return 0;
632 
633 exit:
634     compress_threads_save_cleanup();
635     return -1;
636 }
637 
638 /**
639  * save_page_header: write page header to wire
640  *
641  * If the block differs from the last one sent, it also writes the block id
642  *
643  * Returns the number of bytes written
644  *
645  * @pss: current PSS channel status
646  * @block: block that contains the page we want to send
647  * @offset: offset inside the block for the page
648  *          in the lower bits, it contains flags
649  */
650 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
651                                ram_addr_t offset)
652 {
653     size_t size, len;
654     bool same_block = (block == pss->last_sent_block);
655     QEMUFile *f = pss->pss_channel;
656 
657     if (same_block) {
658         offset |= RAM_SAVE_FLAG_CONTINUE;
659     }
660     qemu_put_be64(f, offset);
661     size = 8;
662 
663     if (!same_block) {
664         len = strlen(block->idstr);
665         qemu_put_byte(f, len);
666         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
667         size += 1 + len;
668         pss->last_sent_block = block;
669     }
670     return size;
671 }
672 
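/*
 * Illustrative layout of the header written by save_page_header(), assuming
 * a hypothetical block named "pc.ram" (6 bytes) that differs from the last
 * block sent on this channel:
 *
 *   be64: offset | flags      (RAM_SAVE_FLAG_CONTINUE not set)
 *   u8  : 6                   (strlen of the idstr)
 *   6 bytes: "pc.ram"
 *
 * i.e. 8 + 1 + 6 = 15 bytes.  When the page belongs to the same block as the
 * previous one, only the first 8 bytes are written, with
 * RAM_SAVE_FLAG_CONTINUE set in the offset.
 */
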
673 /**
674  * mig_throttle_guest_down: throttle down the guest
675  *
676  * Reduce amount of guest cpu execution to hopefully slow down memory
677  * writes. If guest dirty memory rate is reduced below the rate at
678  * which we can transfer pages to the destination then we should be
679  * able to complete migration. Some workloads dirty memory way too
680  * fast and will not effectively converge, even with auto-converge.
681  */
682 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
683                                     uint64_t bytes_dirty_threshold)
684 {
685     MigrationState *s = migrate_get_current();
686     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
687     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
688     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
689     int pct_max = s->parameters.max_cpu_throttle;
690 
691     uint64_t throttle_now = cpu_throttle_get_percentage();
692     uint64_t cpu_now, cpu_ideal, throttle_inc;
693 
694     /* We have not started throttling yet. Let's start it. */
695     if (!cpu_throttle_active()) {
696         cpu_throttle_set(pct_initial);
697     } else {
698         /* Throttling already on, just increase the rate */
699         if (!pct_tailslow) {
700             throttle_inc = pct_increment;
701         } else {
702             /* Compute the ideal CPU percentage used by Guest, which may
703              * make the dirty rate match the dirty rate threshold. */
704             cpu_now = 100 - throttle_now;
705             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
706                         bytes_dirty_period);
707             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
708         }
709         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
710     }
711 }
712 
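/*
 * Worked example of the tailslow path above (hypothetical numbers): with
 * throttle_now = 20%, cpu_now = 80.  If the guest dirtied twice as many
 * bytes as were transferred (bytes_dirty_threshold / bytes_dirty_period
 * = 0.5), then cpu_ideal = 80 * 0.5 = 40, so throttle_inc =
 * MIN(80 - 40, cpu_throttle_increment) and the new throttle becomes
 * MIN(20 + throttle_inc, max_cpu_throttle).
 */
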
713 void mig_throttle_counter_reset(void)
714 {
715     RAMState *rs = ram_state;
716 
717     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
718     rs->num_dirty_pages_period = 0;
719     rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
720 }
721 
722 /**
723  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
724  *
725  * @rs: current RAM state
726  * @current_addr: address for the zero page
727  *
728  * Update the xbzrle cache to reflect a page that's been sent as all 0.
729  * The important thing is that a stale (not-yet-0'd) page be replaced
730  * by the new data.
731  * As a bonus, if the page wasn't in the cache it gets added so that
732  * when a small write is made into the 0'd page it gets XBZRLE sent.
733  */
734 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
735 {
736     /* We don't care if this fails to allocate a new cache page
737      * as long as it updated an old one */
738     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
739                  ram_counters.dirty_sync_count);
740 }
741 
742 #define ENCODING_FLAG_XBZRLE 0x1
743 
744 /**
745  * save_xbzrle_page: compress and send current page
746  *
747  * Returns: 1 means that we wrote the page
748  *          0 means that page is identical to the one already sent
749  *          -1 means that xbzrle would be longer than normal
750  *
751  * @rs: current RAM state
752  * @pss: current PSS channel
753  * @current_data: pointer to the address of the page contents
754  * @current_addr: addr of the page
755  * @block: block that contains the page we want to send
756  * @offset: offset inside the block for the page
757  */
758 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
759                             uint8_t **current_data, ram_addr_t current_addr,
760                             RAMBlock *block, ram_addr_t offset)
761 {
762     int encoded_len = 0, bytes_xbzrle;
763     uint8_t *prev_cached_page;
764     QEMUFile *file = pss->pss_channel;
765 
766     if (!cache_is_cached(XBZRLE.cache, current_addr,
767                          ram_counters.dirty_sync_count)) {
768         xbzrle_counters.cache_miss++;
769         if (!rs->last_stage) {
770             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
771                              ram_counters.dirty_sync_count) == -1) {
772                 return -1;
773             } else {
774                 /* update *current_data when the page has been
775                    inserted into cache */
776                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
777             }
778         }
779         return -1;
780     }
781 
782     /*
783      * Reaching here means the page has hit the xbzrle cache, no matter what
784      * encoding result it is (normal encoding, overflow or skipping the page),
785      * count the page as encoded. This is used to calculate the encoding rate.
786      *
787      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788      * 2nd page turns out to be skipped (i.e. no new bytes written to the
789      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790      * skipped page included. In this way, the encoding rate can tell if the
791      * guest page is good for xbzrle encoding.
792      */
793     xbzrle_counters.pages++;
794     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
795 
796     /* save current buffer into memory */
797     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
798 
799     /* XBZRLE encoding (if there is no overflow) */
800     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802                                        TARGET_PAGE_SIZE);
803 
804     /*
805      * Update the cache contents, so that it corresponds to the data
806      * sent, in all cases except where we skip the page.
807      */
808     if (!rs->last_stage && encoded_len != 0) {
809         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
810         /*
811          * In the case where we couldn't compress, ensure that the caller
812          * sends the data from the cache, since the guest might have
813          * changed the RAM since we copied it.
814          */
815         *current_data = prev_cached_page;
816     }
817 
818     if (encoded_len == 0) {
819         trace_save_xbzrle_page_skipping();
820         return 0;
821     } else if (encoded_len == -1) {
822         trace_save_xbzrle_page_overflow();
823         xbzrle_counters.overflow++;
824         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
825         return -1;
826     }
827 
828     /* Send XBZRLE based compressed page */
829     bytes_xbzrle = save_page_header(pss, block,
830                                     offset | RAM_SAVE_FLAG_XBZRLE);
831     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
832     qemu_put_be16(file, encoded_len);
833     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
834     bytes_xbzrle += encoded_len + 1 + 2;
835     /*
836      * Like compressed_size (please see update_compress_thread_counts),
837      * the xbzrle encoded bytes don't count the 8 byte header with
838      * RAM_SAVE_FLAG_CONTINUE.
839      */
840     xbzrle_counters.bytes += bytes_xbzrle - 8;
841     ram_transferred_add(bytes_xbzrle);
842 
843     return 1;
844 }
845 
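/*
 * Illustrative layout of an XBZRLE page on the wire, as written above
 * (continuing the same block, hypothetical encoded_len = 300):
 *
 *   be64: offset | RAM_SAVE_FLAG_XBZRLE | RAM_SAVE_FLAG_CONTINUE
 *   u8  : ENCODING_FLAG_XBZRLE
 *   be16: 300                  (encoded_len)
 *   300 bytes of encoded data
 *
 * bytes_xbzrle = 8 + 1 + 2 + 300 = 311, of which 311 - 8 = 303 are
 * accounted to xbzrle_counters.bytes (the 8-byte header is excluded).
 */
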
846 /**
847  * pss_find_next_dirty: find the next dirty page of current ramblock
848  *
849  * This function updates pss->page to point to the next dirty page index
850  * within the ramblock to migrate, or the end of ramblock when nothing
851  * found.  Note that when pss->host_page_sending==true it means we're
852  * in the middle of sending a host page, so we won't look for dirty pages
853  * outside the host page boundary.
854  *
855  * @pss: the current page search status
856  */
857 static void pss_find_next_dirty(PageSearchStatus *pss)
858 {
859     RAMBlock *rb = pss->block;
860     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
861     unsigned long *bitmap = rb->bmap;
862 
863     if (ramblock_is_ignored(rb)) {
864         /* Points directly to the end, so we know no dirty page */
865         pss->page = size;
866         return;
867     }
868 
869     /*
870      * While sending a host page, only look for dirty pages within the
871      * current host page being sent.
872      */
873     if (pss->host_page_sending) {
874         assert(pss->host_page_end);
875         size = MIN(size, pss->host_page_end);
876     }
877 
878     pss->page = find_next_bit(bitmap, size, pss->page);
879 }
880 
881 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
882                                                        unsigned long page)
883 {
884     uint8_t shift;
885     hwaddr size, start;
886 
887     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
888         return;
889     }
890 
891     shift = rb->clear_bmap_shift;
892     /*
893      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
894      * can make things easier sometimes since then start address
895      * of the small chunk will always be 64 pages aligned so the
896      * bitmap will always be aligned to unsigned long. We should
897      * even be able to remove this restriction but I'm simply
898      * keeping it.
899      */
900     assert(shift >= 6);
901 
902     size = 1ULL << (TARGET_PAGE_BITS + shift);
903     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
904     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
905     memory_region_clear_dirty_bitmap(rb->mr, start, size);
906 }
907 
908 static void
909 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
910                                                  unsigned long start,
911                                                  unsigned long npages)
912 {
913     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
914     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
915     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
916 
917     /*
918      * Clear pages from start to start + npages - 1, so the end boundary is
919      * exclusive.
920      */
921     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
922         migration_clear_memory_region_dirty_bitmap(rb, i);
923     }
924 }
925 
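/*
 * Worked example for the range helper above (hypothetical values): with
 * clear_bmap_shift = 6, chunk_pages = 64.  Clearing start = 100,
 * npages = 200 gives chunk_start = 64 and chunk_end = 320, so the loop
 * clears the chunks beginning at pages 64, 128, 192 and 256, which
 * together cover pages 100..299.
 */
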
926 /*
927  * colo_bitmap_find_dirty: find contiguous dirty pages from start
928  *
929  * Returns the page offset within the memory region of the start of the
930  * contiguous dirty pages
931  *
932  * @rs: current RAM state
933  * @rb: RAMBlock where to search for dirty pages
934  * @start: page where we start the search
935  * @num: the number of contiguous dirty pages
936  */
937 static inline
938 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
939                                      unsigned long start, unsigned long *num)
940 {
941     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
942     unsigned long *bitmap = rb->bmap;
943     unsigned long first, next;
944 
945     *num = 0;
946 
947     if (ramblock_is_ignored(rb)) {
948         return size;
949     }
950 
951     first = find_next_bit(bitmap, size, start);
952     if (first >= size) {
953         return first;
954     }
955     next = find_next_zero_bit(bitmap, size, first + 1);
956     assert(next >= first);
957     *num = next - first;
958     return first;
959 }
960 
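/*
 * Illustrative example for colo_bitmap_find_dirty() (hypothetical bitmap):
 * if pages 5, 6 and 7 are the only dirty pages and start = 3, then
 * first = 5 and next = 8, so the function returns 5 with *num = 3.
 * If no dirty page exists at or after start, it returns size with *num = 0.
 */
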
961 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
962                                                 RAMBlock *rb,
963                                                 unsigned long page)
964 {
965     bool ret;
966 
967     /*
968      * Clear dirty bitmap if needed.  This _must_ be called before we
969      * send any of the pages in the chunk because we need to make sure
970      * we can capture further page content changes when we sync the dirty
971      * log the next time.  So as long as we are going to send any of
972      * the pages in the chunk we clear the remote dirty bitmap for all.
973      * Clearing it earlier won't be a problem, but clearing it too late will.
974      */
975     migration_clear_memory_region_dirty_bitmap(rb, page);
976 
977     ret = test_and_clear_bit(page, rb->bmap);
978     if (ret) {
979         rs->migration_dirty_pages--;
980     }
981 
982     return ret;
983 }
984 
985 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
986                                        void *opaque)
987 {
988     const hwaddr offset = section->offset_within_region;
989     const hwaddr size = int128_get64(section->size);
990     const unsigned long start = offset >> TARGET_PAGE_BITS;
991     const unsigned long npages = size >> TARGET_PAGE_BITS;
992     RAMBlock *rb = section->mr->ram_block;
993     uint64_t *cleared_bits = opaque;
994 
995     /*
996      * We don't grab ram_state->bitmap_mutex because we expect to run
997      * only when starting migration or during postcopy recovery where
998      * we don't have concurrent access.
999      */
1000     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1001         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1002     }
1003     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1004     bitmap_clear(rb->bmap, start, npages);
1005 }
1006 
1007 /*
1008  * Exclude all dirty pages from migration that fall into a discarded range as
1009  * managed by a RamDiscardManager responsible for the mapped memory region of
1010  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1011  *
1012  * Discarded pages ("logically unplugged") have undefined content and must
1013  * not get migrated, because even reading these pages for migration might
1014  * result in undesired behavior.
1015  *
1016  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1017  *
1018  * Note: The result is only stable while migrating (precopy/postcopy).
1019  */
1020 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1021 {
1022     uint64_t cleared_bits = 0;
1023 
1024     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1025         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1026         MemoryRegionSection section = {
1027             .mr = rb->mr,
1028             .offset_within_region = 0,
1029             .size = int128_make64(qemu_ram_get_used_length(rb)),
1030         };
1031 
1032         ram_discard_manager_replay_discarded(rdm, &section,
1033                                              dirty_bitmap_clear_section,
1034                                              &cleared_bits);
1035     }
1036     return cleared_bits;
1037 }
1038 
1039 /*
1040  * Check if a host-page aligned page falls into a discarded range as managed by
1041  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1042  *
1043  * Note: The result is only stable while migrating (precopy/postcopy).
1044  */
1045 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1046 {
1047     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1048         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1049         MemoryRegionSection section = {
1050             .mr = rb->mr,
1051             .offset_within_region = start,
1052             .size = int128_make64(qemu_ram_pagesize(rb)),
1053         };
1054 
1055         return !ram_discard_manager_is_populated(rdm, &section);
1056     }
1057     return false;
1058 }
1059 
1060 /* Called with RCU critical section */
1061 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1062 {
1063     uint64_t new_dirty_pages =
1064         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1065 
1066     rs->migration_dirty_pages += new_dirty_pages;
1067     rs->num_dirty_pages_period += new_dirty_pages;
1068 }
1069 
1070 /**
1071  * ram_pagesize_summary: calculate all the pagesizes of a VM
1072  *
1073  * Returns a summary bitmap of the page sizes of all RAMBlocks
1074  *
1075  * For VMs with just normal pages this is equivalent to the host page
1076  * size. If it's got some huge pages then it's the OR of all the
1077  * different page sizes.
1078  */
1079 uint64_t ram_pagesize_summary(void)
1080 {
1081     RAMBlock *block;
1082     uint64_t summary = 0;
1083 
1084     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1085         summary |= block->page_size;
1086     }
1087 
1088     return summary;
1089 }
1090 
1091 uint64_t ram_get_total_transferred_pages(void)
1092 {
1093     return  stat64_get(&ram_atomic_counters.normal) +
1094         stat64_get(&ram_atomic_counters.duplicate) +
1095         compression_counters.pages + xbzrle_counters.pages;
1096 }
1097 
1098 static void migration_update_rates(RAMState *rs, int64_t end_time)
1099 {
1100     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1101     double compressed_size;
1102 
1103     /* calculate period counters */
1104     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1105                 / (end_time - rs->time_last_bitmap_sync);
1106 
1107     if (!page_count) {
1108         return;
1109     }
1110 
1111     if (migrate_use_xbzrle()) {
1112         double encoded_size, unencoded_size;
1113 
1114         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1115             rs->xbzrle_cache_miss_prev) / page_count;
1116         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1117         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1118                          TARGET_PAGE_SIZE;
1119         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1120         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1121             xbzrle_counters.encoding_rate = 0;
1122         } else {
1123             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1124         }
1125         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1126         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1127     }
1128 
1129     if (migrate_use_compression()) {
1130         compression_counters.busy_rate = (double)(compression_counters.busy -
1131             rs->compress_thread_busy_prev) / page_count;
1132         rs->compress_thread_busy_prev = compression_counters.busy;
1133 
1134         compressed_size = compression_counters.compressed_size -
1135                           rs->compressed_size_prev;
1136         if (compressed_size) {
1137             double uncompressed_size = (compression_counters.pages -
1138                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1139 
1140             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1141             compression_counters.compression_rate =
1142                                         uncompressed_size / compressed_size;
1143 
1144             rs->compress_pages_prev = compression_counters.pages;
1145             rs->compressed_size_prev = compression_counters.compressed_size;
1146         }
1147     }
1148 }
1149 
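/*
 * Worked example for the XBZRLE encoding rate above (hypothetical numbers,
 * assuming a 4 KiB target page): if 10 pages hit the cache during the
 * period (unencoded_size = 40 KiB) and they produced 8 KiB of encoded
 * bytes, then xbzrle_counters.encoding_rate = 40 / 8 = 5.
 */
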
1150 static void migration_trigger_throttle(RAMState *rs)
1151 {
1152     MigrationState *s = migrate_get_current();
1153     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1154     uint64_t bytes_xfer_period =
1155         stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1156     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1157     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1158 
1159     /* During block migration the auto-converge logic incorrectly detects
1160      * that ram migration makes no progress. Avoid this by disabling the
1161      * throttling logic during the bulk phase of block migration. */
1162     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1163         /* The following detection logic can be refined later. For now:
1164            Check to see if the ratio between dirtied bytes and the approx.
1165            amount of bytes that just got transferred since the last time
1166            we were in this routine reaches the threshold. If that happens
1167            twice, start or increase throttling. */
1168 
1169         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1170             (++rs->dirty_rate_high_cnt >= 2)) {
1171             trace_migration_throttle();
1172             rs->dirty_rate_high_cnt = 0;
1173             mig_throttle_guest_down(bytes_dirty_period,
1174                                     bytes_dirty_threshold);
1175         }
1176     }
1177 }
1178 
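/*
 * Worked example for the trigger above (hypothetical numbers): with
 * throttle_trigger_threshold = 50 and 100 MB transferred during the last
 * period, bytes_dirty_threshold = 100 MB * 50 / 100 = 50 MB.  If the guest
 * dirties more than 50 MB within a period twice, mig_throttle_guest_down()
 * is invoked to start or increase throttling.
 */
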
1179 static void migration_bitmap_sync(RAMState *rs)
1180 {
1181     RAMBlock *block;
1182     int64_t end_time;
1183 
1184     ram_counters.dirty_sync_count++;
1185 
1186     if (!rs->time_last_bitmap_sync) {
1187         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1188     }
1189 
1190     trace_migration_bitmap_sync_start();
1191     memory_global_dirty_log_sync();
1192 
1193     qemu_mutex_lock(&rs->bitmap_mutex);
1194     WITH_RCU_READ_LOCK_GUARD() {
1195         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1196             ramblock_sync_dirty_bitmap(rs, block);
1197         }
1198         ram_counters.remaining = ram_bytes_remaining();
1199     }
1200     qemu_mutex_unlock(&rs->bitmap_mutex);
1201 
1202     memory_global_after_dirty_log_sync();
1203     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1204 
1205     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1206 
1207     /* more than 1 second = 1000 milliseconds */
1208     if (end_time > rs->time_last_bitmap_sync + 1000) {
1209         migration_trigger_throttle(rs);
1210 
1211         migration_update_rates(rs, end_time);
1212 
1213         rs->target_page_count_prev = rs->target_page_count;
1214 
1215         /* reset period counters */
1216         rs->time_last_bitmap_sync = end_time;
1217         rs->num_dirty_pages_period = 0;
1218         rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1219     }
1220     if (migrate_use_events()) {
1221         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1222     }
1223 }
1224 
1225 static void migration_bitmap_sync_precopy(RAMState *rs)
1226 {
1227     Error *local_err = NULL;
1228 
1229     /*
1230      * The current notifier usage is just an optimization to migration, so we
1231      * don't stop the normal migration process in the error case.
1232      */
1233     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1234         error_report_err(local_err);
1235         local_err = NULL;
1236     }
1237 
1238     migration_bitmap_sync(rs);
1239 
1240     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1241         error_report_err(local_err);
1242     }
1243 }
1244 
1245 void ram_release_page(const char *rbname, uint64_t offset)
1246 {
1247     if (!migrate_release_ram() || !migration_in_postcopy()) {
1248         return;
1249     }
1250 
1251     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1252 }
1253 
1254 /**
1255  * save_zero_page_to_file: send the zero page to the file
1256  *
1257  * Returns the size of data written to the file, 0 means the page is not
1258  * a zero page
1259  *
1260  * @pss: current PSS channel
1261  * @block: block that contains the page we want to send
1262  * @offset: offset inside the block for the page
1263  */
1264 static int save_zero_page_to_file(PageSearchStatus *pss,
1265                                   RAMBlock *block, ram_addr_t offset)
1266 {
1267     uint8_t *p = block->host + offset;
1268     QEMUFile *file = pss->pss_channel;
1269     int len = 0;
1270 
1271     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1272         len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1273         qemu_put_byte(file, 0);
1274         len += 1;
1275         ram_release_page(block->idstr, offset);
1276     }
1277     return len;
1278 }
1279 
1280 /**
1281  * save_zero_page: send the zero page to the stream
1282  *
1283  * Returns the number of pages written.
1284  *
1285  * @pss: current PSS channel
1286  * @block: block that contains the page we want to send
1287  * @offset: offset inside the block for the page
1288  */
1289 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1290                           ram_addr_t offset)
1291 {
1292     int len = save_zero_page_to_file(pss, block, offset);
1293 
1294     if (len) {
1295         stat64_add(&ram_atomic_counters.duplicate, 1);
1296         ram_transferred_add(len);
1297         return 1;
1298     }
1299     return -1;
1300 }
1301 
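/*
 * Illustrative layout of a zero page on the wire, as written by
 * save_zero_page_to_file() above (continuing the same block):
 *
 *   be64: offset | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE
 *   u8  : 0
 *
 * i.e. 9 bytes in total, which is also the value added to the transfer
 * counters.
 */
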
1302 /*
1303  * @pages: the number of pages written by the control path,
1304  *        < 0 - error
1305  *        > 0 - number of pages written
1306  *
1307  * Return true if the page has been saved, otherwise false is returned.
1308  */
1309 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1310                               ram_addr_t offset, int *pages)
1311 {
1312     uint64_t bytes_xmit = 0;
1313     int ret;
1314 
1315     *pages = -1;
1316     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1317                                 TARGET_PAGE_SIZE, &bytes_xmit);
1318     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1319         return false;
1320     }
1321 
1322     if (bytes_xmit) {
1323         ram_transferred_add(bytes_xmit);
1324         *pages = 1;
1325     }
1326 
1327     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1328         return true;
1329     }
1330 
1331     if (bytes_xmit > 0) {
1332         stat64_add(&ram_atomic_counters.normal, 1);
1333     } else if (bytes_xmit == 0) {
1334         stat64_add(&ram_atomic_counters.duplicate, 1);
1335     }
1336 
1337     return true;
1338 }
1339 
1340 /*
1341  * directly send the page to the stream
1342  *
1343  * Returns the number of pages written.
1344  *
1345  * @pss: current PSS channel
1346  * @block: block that contains the page we want to send
1347  * @offset: offset inside the block for the page
1348  * @buf: the page to be sent
1349  * @async: send the page asynchronously
1350  */
1351 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1352                             ram_addr_t offset, uint8_t *buf, bool async)
1353 {
1354     QEMUFile *file = pss->pss_channel;
1355 
1356     ram_transferred_add(save_page_header(pss, block,
1357                                          offset | RAM_SAVE_FLAG_PAGE));
1358     if (async) {
1359         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1360                               migrate_release_ram() &&
1361                               migration_in_postcopy());
1362     } else {
1363         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1364     }
1365     ram_transferred_add(TARGET_PAGE_SIZE);
1366     stat64_add(&ram_atomic_counters.normal, 1);
1367     return 1;
1368 }
1369 
1370 /**
1371  * ram_save_page: send the given page to the stream
1372  *
1373  * Returns the number of pages written.
1374  *          < 0 - error
1375  *          >=0 - Number of pages written - this might legally be 0
1376  *                if xbzrle noticed the page was the same.
1377  *
1378  * @rs: current RAM state
1379  * @pss: data about the state of the current dirty page scan,
1380  *       including the block and offset of the page to send
1381  */
1382 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1383 {
1384     int pages = -1;
1385     uint8_t *p;
1386     bool send_async = true;
1387     RAMBlock *block = pss->block;
1388     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1389     ram_addr_t current_addr = block->offset + offset;
1390 
1391     p = block->host + offset;
1392     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1393 
1394     XBZRLE_cache_lock();
1395     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1396         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1397                                  block, offset);
1398         if (!rs->last_stage) {
1399             /* Can't send this cached data async, since the cache page
1400              * might get updated before it gets to the wire
1401              */
1402             send_async = false;
1403         }
1404     }
1405 
1406     /* XBZRLE overflow or normal page */
1407     if (pages == -1) {
1408         pages = save_normal_page(pss, block, offset, p, send_async);
1409     }
1410 
1411     XBZRLE_cache_unlock();
1412 
1413     return pages;
1414 }
1415 
1416 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1417                                  ram_addr_t offset)
1418 {
1419     if (multifd_queue_page(file, block, offset) < 0) {
1420         return -1;
1421     }
1422     stat64_add(&ram_atomic_counters.normal, 1);
1423 
1424     return 1;
1425 }
1426 
1427 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1428                                  ram_addr_t offset, uint8_t *source_buf)
1429 {
1430     RAMState *rs = ram_state;
1431     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1432     uint8_t *p = block->host + offset;
1433     int ret;
1434 
1435     if (save_zero_page_to_file(pss, block, offset)) {
1436         return true;
1437     }
1438 
1439     save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1440 
1441     /*
1442      * copy it to an internal buffer to avoid it being modified by the VM
1443      * so that we can catch any error during compression and
1444      * decompression
1445      */
1446     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1447     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1448     if (ret < 0) {
1449         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1450         error_report("compressed data failed!");
1451     }
1452     return false;
1453 }
1454 
1455 static void
1456 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1457 {
1458     ram_transferred_add(bytes_xmit);
1459 
1460     if (param->zero_page) {
1461         stat64_add(&ram_atomic_counters.duplicate, 1);
1462         return;
1463     }
1464 
1465     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1466     compression_counters.compressed_size += bytes_xmit - 8;
1467     compression_counters.pages++;
1468 }
1469 
1470 static bool save_page_use_compression(RAMState *rs);
1471 
1472 static void flush_compressed_data(RAMState *rs)
1473 {
1474     MigrationState *ms = migrate_get_current();
1475     int idx, len, thread_count;
1476 
1477     if (!save_page_use_compression(rs)) {
1478         return;
1479     }
1480     thread_count = migrate_compress_threads();
1481 
1482     qemu_mutex_lock(&comp_done_lock);
1483     for (idx = 0; idx < thread_count; idx++) {
1484         while (!comp_param[idx].done) {
1485             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1486         }
1487     }
1488     qemu_mutex_unlock(&comp_done_lock);
1489 
1490     for (idx = 0; idx < thread_count; idx++) {
1491         qemu_mutex_lock(&comp_param[idx].mutex);
1492         if (!comp_param[idx].quit) {
1493             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1494             /*
1495              * it's safe to fetch zero_page without holding comp_done_lock
1496              * as there is no further request submitted to the thread,
1497              * i.e., the thread should be waiting for a request at this point.
1498              */
1499             update_compress_thread_counts(&comp_param[idx], len);
1500         }
1501         qemu_mutex_unlock(&comp_param[idx].mutex);
1502     }
1503 }
1504 
1505 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1506                                        ram_addr_t offset)
1507 {
1508     param->block = block;
1509     param->offset = offset;
1510 }
1511 
1512 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1513 {
1514     int idx, thread_count, bytes_xmit = -1, pages = -1;
1515     bool wait = migrate_compress_wait_thread();
1516     MigrationState *ms = migrate_get_current();
1517 
1518     thread_count = migrate_compress_threads();
1519     qemu_mutex_lock(&comp_done_lock);
1520 retry:
1521     for (idx = 0; idx < thread_count; idx++) {
1522         if (comp_param[idx].done) {
1523             comp_param[idx].done = false;
1524             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1525                                             comp_param[idx].file);
1526             qemu_mutex_lock(&comp_param[idx].mutex);
1527             set_compress_params(&comp_param[idx], block, offset);
1528             qemu_cond_signal(&comp_param[idx].cond);
1529             qemu_mutex_unlock(&comp_param[idx].mutex);
1530             pages = 1;
1531             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1532             break;
1533         }
1534     }
1535 
1536     /*
1537      * wait for the free thread if the user specifies 'compress-wait-thread',
1538      * otherwise we will post the page out in the main thread as a normal page.
1539      */
1540     if (pages < 0 && wait) {
1541         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542         goto retry;
1543     }
1544     qemu_mutex_unlock(&comp_done_lock);
1545 
1546     return pages;
1547 }
1548 
1549 /**
1550  * find_dirty_block: find the next dirty page and update any state
1551  * associated with the search process.
1552  *
1553  * Returns true if a page is found
1554  *
1555  * @rs: current RAM state
1556  * @pss: data about the state of the current dirty page scan
1557  * @again: set to false if the search has scanned the whole of RAM
1558  */
1559 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1560 {
1561     /* Update pss->page for the next dirty bit in ramblock */
1562     pss_find_next_dirty(pss);
1563 
1564     if (pss->complete_round && pss->block == rs->last_seen_block &&
1565         pss->page >= rs->last_page) {
1566         /*
1567          * We've been once around the RAM and haven't found anything.
1568          * Give up.
1569          */
1570         *again = false;
1571         return false;
1572     }
1573     if (!offset_in_ramblock(pss->block,
1574                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1575         /* Didn't find anything in this RAM Block */
1576         pss->page = 0;
1577         pss->block = QLIST_NEXT_RCU(pss->block, next);
1578         if (!pss->block) {
1579             /*
1580              * If memory migration starts over, we will meet a dirtied page
1581              * which may still exist in the compression threads' ring, so we
1582              * should flush the compressed data to make sure the new page
1583              * is not overwritten by the old one in the destination.
1584              *
1585              * Also If xbzrle is on, stop using the data compression at this
1586              * point. In theory, xbzrle can do better than compression.
1587              */
1588             flush_compressed_data(rs);
1589 
1590             /* Hit the end of the list */
1591             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1592             /* Flag that we've looped */
1593             pss->complete_round = true;
1594             /* After the first round, enable XBZRLE. */
1595             if (migrate_use_xbzrle()) {
1596                 rs->xbzrle_enabled = true;
1597             }
1598         }
1599         /* Didn't find anything this time, but try again on the new block */
1600         *again = true;
1601         return false;
1602     } else {
1603         /* Can go around again, but... */
1604         *again = true;
1605         /* We've found something, so we probably don't need to */
1606         return true;
1607     }
1608 }
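
/*
 * To summarize the outcomes above: if a dirty page was found in the current
 * block we return true (with *again == true); if the current block is
 * exhausted we advance to the next block (wrapping to the first one and
 * flushing compressed data at the end of the list) and return false with
 * *again == true so the caller keeps searching; only after a complete,
 * fruitless pass over RAM do we return false with *again == false.
 */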
1609 
1610 /**
1611  * unqueue_page: gets a page off the queue
1612  *
1613  * Helper for 'get_queued_page' - gets a page off the queue
1614  *
1615  * Returns the block of the page (or NULL if none available)
1616  *
1617  * @rs: current RAM state
1618  * @offset: used to return the offset within the RAMBlock
1619  */
1620 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1621 {
1622     struct RAMSrcPageRequest *entry;
1623     RAMBlock *block = NULL;
1624 
1625     if (!postcopy_has_request(rs)) {
1626         return NULL;
1627     }
1628 
1629     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1630 
1631     /*
1632      * This should _never_ change even after we take the lock, because no one
1633      * should be taking anything off the request list other than us.
1634      */
1635     assert(postcopy_has_request(rs));
1636 
1637     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1638     block = entry->rb;
1639     *offset = entry->offset;
1640 
1641     if (entry->len > TARGET_PAGE_SIZE) {
1642         entry->len -= TARGET_PAGE_SIZE;
1643         entry->offset += TARGET_PAGE_SIZE;
1644     } else {
1645         memory_region_unref(block->mr);
1646         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1647         g_free(entry);
1648         migration_consume_urgent_request();
1649     }
1650 
1651     return block;
1652 }
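
/*
 * Requests larger than one target page are consumed one target page at a
 * time: the head entry stays queued with its offset advanced and its length
 * reduced, and is only freed (dropping the memory-region reference and the
 * urgent-request accounting) once it has been fully consumed.
 */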
1653 
1654 #if defined(__linux__)
1655 /**
1656  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1657  *   is found, return RAM block pointer and page offset
1658  *
1659  * Returns pointer to the RAMBlock containing faulting page,
1660  *   NULL if no write faults are pending
1661  *
1662  * @rs: current RAM state
1663  * @offset: page offset from the beginning of the block
1664  */
1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1666 {
1667     struct uffd_msg uffd_msg;
1668     void *page_address;
1669     RAMBlock *block;
1670     int res;
1671 
1672     if (!migrate_background_snapshot()) {
1673         return NULL;
1674     }
1675 
1676     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1677     if (res <= 0) {
1678         return NULL;
1679     }
1680 
1681     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1682     block = qemu_ram_block_from_host(page_address, false, offset);
1683     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1684     return block;
1685 }
1686 
1687 /**
1688  * ram_save_release_protection: release UFFD write protection after
1689  *   a range of pages has been saved
1690  *
1691  * @rs: current RAM state
1692  * @pss: page-search-status structure
1693  * @start_page: index of the first page in the range relative to pss->block
1694  *
1695  * Returns 0 on success, negative value in case of an error
1696  */
1697 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1698         unsigned long start_page)
1699 {
1700     int res = 0;
1701 
1702     /* Check if the page is from a UFFD-managed region. */
1703     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1704         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1705         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1706 
1707         /* Flush async buffers before un-protect. */
1708         qemu_fflush(pss->pss_channel);
1709         /* Un-protect memory range. */
1710         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1711                 false, false);
1712     }
1713 
1714     return res;
1715 }
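
/*
 * The range [start_page, pss->page) has just been saved, so once the channel
 * buffers are flushed it should be safe to drop the write protection and let
 * any vCPU that faulted on those pages make progress again.
 */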
1716 
1717 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1718  *
1719  * Returns true if it does, false otherwise
1720  */
1721 bool ram_write_tracking_available(void)
1722 {
1723     uint64_t uffd_features;
1724     int res;
1725 
1726     res = uffd_query_features(&uffd_features);
1727     return (res == 0 &&
1728             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1729 }
1730 
1731 /* ram_write_tracking_compatible: check if guest configuration is
1732  *   compatible with 'write-tracking'
1733  *
1734  * Returns true if compatible, false otherwise
1735  */
1736 bool ram_write_tracking_compatible(void)
1737 {
1738     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1739     int uffd_fd;
1740     RAMBlock *block;
1741     bool ret = false;
1742 
1743     /* Open UFFD file descriptor */
1744     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1745     if (uffd_fd < 0) {
1746         return false;
1747     }
1748 
1749     RCU_READ_LOCK_GUARD();
1750 
1751     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1752         uint64_t uffd_ioctls;
1753 
1754         /* Nothing to do with read-only and MMIO-writable regions */
1755         /* Nothing to do for read-only and MMIO-writable regions */
1756             continue;
1757         }
1758         /* Try to register block memory via UFFD-IO to track writes */
1759         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1760                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1761             goto out;
1762         }
1763         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1764             goto out;
1765         }
1766     }
1767     ret = true;
1768 
1769 out:
1770     uffd_close_fd(uffd_fd);
1771     return ret;
1772 }
1773 
1774 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1775                                        ram_addr_t size)
1776 {
1777     const ram_addr_t end = offset + size;
1778 
1779     /*
1780      * We read one byte of each page; this will preallocate page tables if
1781      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1782              * where no page was populated yet. This might require adaptation when
1783      * supporting other mappings, like shmem.
1784      */
1785     for (; offset < end; offset += block->page_size) {
1786         char tmp = *((char *)block->host + offset);
1787 
1788         /* Don't optimize the read out */
1789         asm volatile("" : "+r" (tmp));
1790     }
1791 }
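
/*
 * The empty asm with the "+r"(tmp) constraint marks tmp as used (read and
 * written), so the compiler cannot optimize the load away; touching a single
 * byte per block->page_size is enough to populate each page.
 */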
1792 
1793 static inline int populate_read_section(MemoryRegionSection *section,
1794                                         void *opaque)
1795 {
1796     const hwaddr size = int128_get64(section->size);
1797     hwaddr offset = section->offset_within_region;
1798     RAMBlock *block = section->mr->ram_block;
1799 
1800     populate_read_range(block, offset, size);
1801     return 0;
1802 }
1803 
1804 /*
1805  * ram_block_populate_read: preallocate page tables and populate pages in the
1806  *   RAM block by reading a byte of each page.
1807  *
1808  * Since it's solely used for userfault_fd WP feature, here we just
1809  *   hardcode page size to qemu_real_host_page_size.
1810  *
1811  * @rb: RAM block to populate
1812  */
1813 static void ram_block_populate_read(RAMBlock *rb)
1814 {
1815     /*
1816      * Skip populating all pages that fall into a discarded range as managed by
1817      * a RamDiscardManager responsible for the mapped memory region of the
1818      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1819      * must not get populated automatically. We don't have to track
1820      * modifications via userfaultfd WP reliably, because these pages will
1821      * not be part of the migration stream either way -- see
1822      * ramblock_dirty_bitmap_exclude_discarded_pages().
1823      *
1824      * Note: The result is only stable while migrating (precopy/postcopy).
1825      */
1826     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1827         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1828         MemoryRegionSection section = {
1829             .mr = rb->mr,
1830             .offset_within_region = 0,
1831             .size = rb->mr->size,
1832         };
1833 
1834         ram_discard_manager_replay_populated(rdm, &section,
1835                                              populate_read_section, NULL);
1836     } else {
1837         populate_read_range(rb, 0, rb->used_length);
1838     }
1839 }
1840 
1841 /*
1842  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1843  */
1844 void ram_write_tracking_prepare(void)
1845 {
1846     RAMBlock *block;
1847 
1848     RCU_READ_LOCK_GUARD();
1849 
1850     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1851         /* Nothing to do with read-only and MMIO-writable regions */
1852         /* Nothing to do for read-only and MMIO-writable regions */
1853             continue;
1854         }
1855 
1856         /*
1857          * Populate pages of the RAM block before enabling userfault_fd
1858          * write protection.
1859          *
1860          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1861          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1862          * pages with pte_none() entries in page table.
1862          * pages with pte_none() entries in the page table.
1864         ram_block_populate_read(block);
1865     }
1866 }
1867 
1868 static inline int uffd_protect_section(MemoryRegionSection *section,
1869                                        void *opaque)
1870 {
1871     const hwaddr size = int128_get64(section->size);
1872     const hwaddr offset = section->offset_within_region;
1873     RAMBlock *rb = section->mr->ram_block;
1874     int uffd_fd = (uintptr_t)opaque;
1875 
1876     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1877                                   false);
1878 }
1879 
1880 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1881 {
1882     assert(rb->flags & RAM_UF_WRITEPROTECT);
1883 
1884     /* See ram_block_populate_read() */
1885     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1886         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1887         MemoryRegionSection section = {
1888             .mr = rb->mr,
1889             .offset_within_region = 0,
1890             .size = rb->mr->size,
1891         };
1892 
1893         return ram_discard_manager_replay_populated(rdm, &section,
1894                                                     uffd_protect_section,
1895                                                     (void *)(uintptr_t)uffd_fd);
1896     }
1897     return uffd_change_protection(uffd_fd, rb->host,
1898                                   rb->used_length, true, false);
1899 }
1900 
1901 /*
1902  * ram_write_tracking_start: start UFFD-WP memory tracking
1903  *
1904  * Returns 0 for success or negative value in case of error
1905  */
1906 int ram_write_tracking_start(void)
1907 {
1908     int uffd_fd;
1909     RAMState *rs = ram_state;
1910     RAMBlock *block;
1911 
1912     /* Open UFFD file descriptor */
1913     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1914     if (uffd_fd < 0) {
1915         return uffd_fd;
1916     }
1917     rs->uffdio_fd = uffd_fd;
1918 
1919     RCU_READ_LOCK_GUARD();
1920 
1921     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1922         /* Nothing to do with read-only and MMIO-writable regions */
1923         /* Nothing to do for read-only and MMIO-writable regions */
1924             continue;
1925         }
1926 
1927         /* Register block memory with UFFD to track writes */
1928         if (uffd_register_memory(rs->uffdio_fd, block->host,
1929                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1930             goto fail;
1931         }
1932         block->flags |= RAM_UF_WRITEPROTECT;
1933         memory_region_ref(block->mr);
1934 
1935         /* Apply UFFD write protection to the block memory range */
1936         if (ram_block_uffd_protect(block, uffd_fd)) {
1937             goto fail;
1938         }
1939 
1940         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1941                 block->host, block->max_length);
1942     }
1943 
1944     return 0;
1945 
1946 fail:
1947     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1948 
1949     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1950         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1951             continue;
1952         }
1953         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1954         /* Cleanup flags and remove reference */
1955         block->flags &= ~RAM_UF_WRITEPROTECT;
1956         memory_region_unref(block->mr);
1957     }
1958 
1959     uffd_close_fd(uffd_fd);
1960     rs->uffdio_fd = -1;
1961     return -1;
1962 }
1963 
1964 /**
1965  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1966  */
1967 void ram_write_tracking_stop(void)
1968 {
1969     RAMState *rs = ram_state;
1970     RAMBlock *block;
1971 
1972     RCU_READ_LOCK_GUARD();
1973 
1974     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1975         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1976             continue;
1977         }
1978         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1979 
1980         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1981                 block->host, block->max_length);
1982 
1983         /* Cleanup flags and remove reference */
1984         block->flags &= ~RAM_UF_WRITEPROTECT;
1985         memory_region_unref(block->mr);
1986     }
1987 
1988     /* Finally close UFFD file descriptor */
1989     uffd_close_fd(rs->uffdio_fd);
1990     rs->uffdio_fd = -1;
1991 }
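
/*
 * Roughly, the background-snapshot lifecycle of the helpers above is:
 * ram_write_tracking_available()/ram_write_tracking_compatible() gate the
 * feature, ram_write_tracking_prepare() populates pages so that
 * UFFDIO_WRITEPROTECT does not skip pte_none() entries,
 * ram_write_tracking_start() registers and write-protects all migratable
 * blocks, poll_fault_page() feeds faulting pages into the page search while
 * ram_save_release_protection() unprotects ranges once they are saved, and
 * ram_write_tracking_stop() finally tears everything down.
 */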
1992 
1993 #else
1994 /* No target OS support, stubs just fail or ignore */
1995 
1996 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1997 {
1998     (void) rs;
1999     (void) offset;
2000 
2001     return NULL;
2002 }
2003 
2004 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2005         unsigned long start_page)
2006 {
2007     (void) rs;
2008     (void) pss;
2009     (void) start_page;
2010 
2011     return 0;
2012 }
2013 
2014 bool ram_write_tracking_available(void)
2015 {
2016     return false;
2017 }
2018 
2019 bool ram_write_tracking_compatible(void)
2020 {
2021     assert(0);
2022     return false;
2023 }
2024 
2025 int ram_write_tracking_start(void)
2026 {
2027     assert(0);
2028     return -1;
2029 }
2030 
2031 void ram_write_tracking_stop(void)
2032 {
2033     assert(0);
2034 }
2035 #endif /* defined(__linux__) */
2036 
2037 /**
2038  * get_queued_page: unqueue a page from the postcopy requests
2039  *
2040  * Skips pages that are already sent (!dirty)
2041  *
2042  * Returns true if a queued page is found
2043  *
2044  * @rs: current RAM state
2045  * @pss: data about the state of the current dirty page scan
2046  */
2047 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2048 {
2049     RAMBlock  *block;
2050     ram_addr_t offset;
2051     bool dirty;
2052 
2053     do {
2054         block = unqueue_page(rs, &offset);
2055         /*
2056          * We're sending this page, and since it's postcopy nothing else
2057          * will dirty it, and we must make sure it doesn't get sent again
2058          * even if this queue request was received after the background
2059          * search already sent it.
2060          */
2061         if (block) {
2062             unsigned long page;
2063 
2064             page = offset >> TARGET_PAGE_BITS;
2065             dirty = test_bit(page, block->bmap);
2066             if (!dirty) {
2067                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2068                                                 page);
2069             } else {
2070                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2071             }
2072         }
2073 
2074     } while (block && !dirty);
2075 
2076     if (!block) {
2077         /*
2078          * Poll write faults too if background snapshot is enabled; that's
2079          * when vCPUs may be blocked on write-protected pages.
2080          */
2081         block = poll_fault_page(rs, &offset);
2082     }
2083 
2084     if (block) {
2085         /*
2086          * We want the background search to continue from the queued page
2087          * since the guest is likely to want other pages near to the page
2088          * it just requested.
2089          */
2090         pss->block = block;
2091         pss->page = offset >> TARGET_PAGE_BITS;
2092 
2093         /*
2094          * This unqueued page would break the "one round" check, even if it
2095          * is really rare.
2096          */
2097         pss->complete_round = false;
2098     }
2099 
2100     return !!block;
2101 }
2102 
2103 /**
2104  * migration_page_queue_free: drop any remaining pages in the ram
2105  * request queue
2106  *
2107  * It should be empty at the end anyway, but in error cases there may
2108  * be some left.  In case any page is left, we drop it.
2109  *
2110  */
2111 static void migration_page_queue_free(RAMState *rs)
2112 {
2113     struct RAMSrcPageRequest *mspr, *next_mspr;
2114     /* This queue should generally be empty - but in the case of a failed
2115      * migration it might have some leftover entries.
2116      */
2117     RCU_READ_LOCK_GUARD();
2118     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2119         memory_region_unref(mspr->rb->mr);
2120         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2121         g_free(mspr);
2122     }
2123 }
2124 
2125 /**
2126  * ram_save_queue_pages: queue the page for transmission
2127  *
2128  * A request from postcopy destination for example.
2129  *
2130  * Returns zero on success or negative on error
2131  *
2132  * @rbname: Name of the RAMBlock of the request. NULL means the
2133  *          same as the last one.
2134  * @start: starting address from the start of the RAMBlock
2135  * @len: length (in bytes) to send
2136  */
2137 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2138 {
2139     RAMBlock *ramblock;
2140     RAMState *rs = ram_state;
2141 
2142     ram_counters.postcopy_requests++;
2143     RCU_READ_LOCK_GUARD();
2144 
2145     if (!rbname) {
2146         /* Reuse last RAMBlock */
2147         ramblock = rs->last_req_rb;
2148 
2149         if (!ramblock) {
2150             /*
2151              * Shouldn't happen, we can't reuse the last RAMBlock if
2152              * it's the 1st request.
2153              */
2154             error_report("ram_save_queue_pages no previous block");
2155             return -1;
2156         }
2157     } else {
2158         ramblock = qemu_ram_block_by_name(rbname);
2159 
2160         if (!ramblock) {
2161             /* We shouldn't be asked for a non-existent RAMBlock */
2162             error_report("ram_save_queue_pages no block '%s'", rbname);
2163             return -1;
2164         }
2165         rs->last_req_rb = ramblock;
2166     }
2167     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2168     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2169         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2170                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2171                      __func__, start, len, ramblock->used_length);
2172         return -1;
2173     }
2174 
2175     /*
2176      * When with postcopy preempt, we send back the page directly in the
2177      * rp-return thread.
2178      */
2179     if (postcopy_preempt_active()) {
2180         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2181         size_t page_size = qemu_ram_pagesize(ramblock);
2182         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2183         int ret = 0;
2184 
2185         qemu_mutex_lock(&rs->bitmap_mutex);
2186 
2187         pss_init(pss, ramblock, page_start);
2188         /*
2189          * Always use the preempt channel, and make sure it's there.  It's
2190          * safe to access without the lock, because when the rp-thread is
2191          * running we should be the only one operating on the qemufile.
2192          */
2193         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2194         assert(pss->pss_channel);
2195 
2196         /*
2197          * It must be one host page or a multiple of the host page size.
2198          * Just assert; if something is wrong we're mostly split-brain anyway.
2199          */
2200         assert(len % page_size == 0);
2201         while (len) {
2202             if (ram_save_host_page_urgent(pss)) {
2203                 error_report("%s: ram_save_host_page_urgent() failed: "
2204                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2205                              __func__, ramblock->idstr, start);
2206                 ret = -1;
2207                 break;
2208             }
2209             /*
2210              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2211              * will automatically be moved and point to the next host page
2212              * we're going to send, so no need to update here.
2213              *
2214              * Normally QEMU never sends >1 host page per request, so
2215              * logically we don't even need the loop as it should only
2216              * run once; keep it just to be consistent.
2217              */
2218             len -= page_size;
2219         }
2220         qemu_mutex_unlock(&rs->bitmap_mutex);
2221 
2222         return ret;
2223     }
2224 
2225     struct RAMSrcPageRequest *new_entry =
2226         g_new0(struct RAMSrcPageRequest, 1);
2227     new_entry->rb = ramblock;
2228     new_entry->offset = start;
2229     new_entry->len = len;
2230 
2231     memory_region_ref(ramblock->mr);
2232     qemu_mutex_lock(&rs->src_page_req_mutex);
2233     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2234     migration_make_urgent_request();
2235     qemu_mutex_unlock(&rs->src_page_req_mutex);
2236 
2237     return 0;
2238 }
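
/*
 * Two paths above: with postcopy preempt active, the requested range is sent
 * right away on the preempt channel from this (rp-return thread) context;
 * otherwise the request is queued on src_page_requests and later drained by
 * get_queued_page() in the migration thread.
 */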
2239 
2240 static bool save_page_use_compression(RAMState *rs)
2241 {
2242     if (!migrate_use_compression()) {
2243         return false;
2244     }
2245 
2246     /*
2247      * If xbzrle is enabled (e.g., after first round of migration), stop
2248      * using the data compression. In theory, xbzrle can do better than
2249      * compression.
2250      */
2251     if (rs->xbzrle_enabled) {
2252         return false;
2253     }
2254 
2255     return true;
2256 }
2257 
2258 /*
2259  * try to compress the page before posting it out, return true if the page
2260  * has been properly handled by compression, otherwise needs other
2261  * paths to handle it
2262  */
2263 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2264                                RAMBlock *block, ram_addr_t offset)
2265 {
2266     if (!save_page_use_compression(rs)) {
2267         return false;
2268     }
2269 
2270     /*
2271      * When starting to process a new block, the first page of the
2272      * block should be sent out before other pages in the same block,
2273      * and all the pages in the last block should have been sent out.
2274      * Keeping this order is important, because the 'cont' flag
2275      * is used to avoid resending the block name.
2276      *
2277      * We post the first page as a normal page, as compression takes
2278      * a lot of CPU time.
2279      */
2280     if (block != pss->last_sent_block) {
2281         flush_compressed_data(rs);
2282         return false;
2283     }
2284 
2285     if (compress_page_with_multi_thread(block, offset) > 0) {
2286         return true;
2287     }
2288 
2289     compression_counters.busy++;
2290     return false;
2291 }
2292 
2293 /**
2294  * ram_save_target_page: save one target page
2295  *
2296  * Returns the number of pages written
2297  *
2298  * @rs: current RAM state
2299  * @pss: data about the page we want to send
2300  */
2301 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2302 {
2303     RAMBlock *block = pss->block;
2304     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2305     int res;
2306 
2307     if (control_save_page(pss, block, offset, &res)) {
2308         return res;
2309     }
2310 
2311     if (save_compress_page(rs, pss, block, offset)) {
2312         return 1;
2313     }
2314 
2315     res = save_zero_page(pss, block, offset);
2316     if (res > 0) {
2317         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2318          * page would be stale
2319          */
2320         if (rs->xbzrle_enabled) {
2321             XBZRLE_cache_lock();
2322             xbzrle_cache_zero_page(rs, block->offset + offset);
2323             XBZRLE_cache_unlock();
2324         }
2325         return res;
2326     }
2327 
2328     /*
2329      * Do not use multifd in postcopy, as one whole host page should be
2330      * placed at a time.  Postcopy also requires atomic updates of pages,
2331      * so even if host page size == guest page size the destination guest
2332      * may still see partially copied pages, which is data corruption.
2333      */
2334     if (migrate_use_multifd() && !migration_in_postcopy()) {
2335         return ram_save_multifd_page(pss->pss_channel, block, offset);
2336     }
2337 
2338     return ram_save_page(rs, pss);
2339 }
2340 
2341 /* Should be called before sending a host page */
2342 static void pss_host_page_prepare(PageSearchStatus *pss)
2343 {
2344     /* How many guest pages are there in one host page? */
2345     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2346 
2347     pss->host_page_sending = true;
2348     if (guest_pfns <= 1) {
2349         /*
2350          * This covers both the case when guest psize == host psize and
2351          * when the guest has a larger psize than the host (guest_pfns == 0).
2352          *
2353          * For the latter, we always send one whole guest page per
2354          * iteration of the host page (example: an Alpha VM on x86 host
2355          * will have guest psize 8K while host psize 4K).
2356          */
2357         pss->host_page_start = pss->page;
2358         pss->host_page_end = pss->page + 1;
2359     } else {
2360         /*
2361          * The host page spans multiple guest pages; we send them
2362          * within the same host page iteration.
2363          */
2364         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2365         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2366     }
2367 }
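
/*
 * Worked example, assuming a RAMBlock backed by 2MiB huge pages and 4KiB
 * target pages: guest_pfns == 512, so for pss->page == 1000 the boundaries
 * become host_page_start == 512 and host_page_end == 1024, i.e. exactly the
 * huge page that contains the current target page.
 */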
2368 
2369 /*
2370  * Whether the page pointed by PSS is within the host page being sent.
2371  * Must be called after a previous pss_host_page_prepare().
2372  */
2373 static bool pss_within_range(PageSearchStatus *pss)
2374 {
2375     ram_addr_t ram_addr;
2376 
2377     assert(pss->host_page_sending);
2378 
2379     /* Over host-page boundary? */
2380     if (pss->page >= pss->host_page_end) {
2381         return false;
2382     }
2383 
2384     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2385 
2386     return offset_in_ramblock(pss->block, ram_addr);
2387 }
2388 
2389 static void pss_host_page_finish(PageSearchStatus *pss)
2390 {
2391     pss->host_page_sending = false;
2392     /* This is not needed, but just to reset it */
2393     pss->host_page_start = pss->host_page_end = 0;
2394 }
2395 
2396 /*
2397  * Send an urgent host page specified by `pss'.  Needs to be called with
2398  * bitmap_mutex held.
2399  *
2400  * Returns 0 if saving the host page succeeded, a negative value otherwise.
2401  */
2402 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2403 {
2404     bool page_dirty, sent = false;
2405     RAMState *rs = ram_state;
2406     int ret = 0;
2407 
2408     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2409     pss_host_page_prepare(pss);
2410 
2411     /*
2412      * If precopy is sending the same page, let it be done in precopy, or
2413      * we could send the same page in two channels and none of them will
2414      * receive the whole page.
2415      */
2416     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2417         trace_postcopy_preempt_hit(pss->block->idstr,
2418                                    pss->page << TARGET_PAGE_BITS);
2419         return 0;
2420     }
2421 
2422     do {
2423         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2424 
2425         if (page_dirty) {
2426             /* Be strict about the return code; it must be 1 here */
2427             if (ram_save_target_page(rs, pss) != 1) {
2428                 error_report_once("%s: ram_save_target_page failed", __func__);
2429                 ret = -1;
2430                 goto out;
2431             }
2432             sent = true;
2433         }
2434         pss_find_next_dirty(pss);
2435     } while (pss_within_range(pss));
2436 out:
2437     pss_host_page_finish(pss);
2438     /* For urgent requests, flush immediately if sent */
2439     if (sent) {
2440         qemu_fflush(pss->pss_channel);
2441     }
2442     return ret;
2443 }
2444 
2445 /**
2446  * ram_save_host_page: save a whole host page
2447  *
2448  * Starting at pss->page, send pages up to the end of the current host
2449  * page. It's valid for the initial page to point into the middle of
2450  * a host page, in which case the remainder of the host page is sent.
2451  * Only dirty target pages are sent. Note that the host page size may
2452  * be a huge page for this block.
2453  *
2454  * The saving stops at the boundary of the used_length of the block
2455  * if the RAMBlock isn't a multiple of the host page size.
2456  *
2457  * The caller must hold ram_state.bitmap_mutex when calling this
2458  * function.  Note that this function can temporarily release the lock, but
2459  * it makes sure the lock is held again before it returns.
2460  *
2461  * Returns the number of pages written or negative on error
2462  *
2463  * @rs: current RAM state
2464  * @pss: data about the page we want to send
2465  */
2466 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2467 {
2468     bool page_dirty, preempt_active = postcopy_preempt_active();
2469     int tmppages, pages = 0;
2470     size_t pagesize_bits =
2471         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2472     unsigned long start_page = pss->page;
2473     int res;
2474 
2475     if (ramblock_is_ignored(pss->block)) {
2476         error_report("block %s should not be migrated!", pss->block->idstr);
2477         return 0;
2478     }
2479 
2480     /* Update host page boundary information */
2481     pss_host_page_prepare(pss);
2482 
2483     do {
2484         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2485 
2486         /* Check whether the page is dirty and, if so, send it */
2487         if (page_dirty) {
2488             /*
2489              * Properly yield the lock only in postcopy preempt mode
2490              * because both migration thread and rp-return thread can
2491              * operate on the bitmaps.
2492              */
2493             if (preempt_active) {
2494                 qemu_mutex_unlock(&rs->bitmap_mutex);
2495             }
2496             tmppages = ram_save_target_page(rs, pss);
2497             if (tmppages >= 0) {
2498                 pages += tmppages;
2499                 /*
2500                  * Allow rate limiting to happen in the middle of huge pages if
2501                  * something is sent in the current iteration.
2502                  */
2503                 if (pagesize_bits > 1 && tmppages > 0) {
2504                     migration_rate_limit();
2505                 }
2506             }
2507             if (preempt_active) {
2508                 qemu_mutex_lock(&rs->bitmap_mutex);
2509             }
2510         } else {
2511             tmppages = 0;
2512         }
2513 
2514         if (tmppages < 0) {
2515             pss_host_page_finish(pss);
2516             return tmppages;
2517         }
2518 
2519         pss_find_next_dirty(pss);
2520     } while (pss_within_range(pss));
2521 
2522     pss_host_page_finish(pss);
2523 
2524     res = ram_save_release_protection(rs, pss, start_page);
2525     return (res < 0 ? res : pages);
2526 }
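
/*
 * In postcopy preempt mode, bitmap_mutex is dropped around each
 * ram_save_target_page() call so that the rp-return thread can service
 * urgent pages; the dirty bit is cleared before the lock is released, which,
 * together with the pss_overlap() check in ram_save_host_page_urgent(),
 * should keep the two channels from sending the same page twice.
 */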
2527 
2528 /**
2529  * ram_find_and_save_block: finds a dirty page and sends it to f
2530  *
2531  * Called within an RCU critical section.
2532  *
2533  * Returns the number of pages written where zero means no dirty pages,
2534  * or negative on error
2535  *
2536  * @rs: current RAM state
2537  *
2538  * On systems where host-page-size > target-page-size it will send all the
2539  * pages in a host page that are dirty.
2540  */
2541 static int ram_find_and_save_block(RAMState *rs)
2542 {
2543     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2544     int pages = 0;
2545     bool again, found;
2546 
2547     /* No dirty page as there is zero RAM */
2548     if (!ram_bytes_total()) {
2549         return pages;
2550     }
2551 
2552     /*
2553      * Always keep last_seen_block/last_page valid during this procedure,
2554      * because find_dirty_block() relies on these values (e.g., we compare
2555      * last_seen_block with pss.block to see whether we searched all the
2556      * ramblocks) to detect the completion of migration.  Having a NULL
2557      * last_seen_block can cause the loop below to run forever.
2558      */
2559     if (!rs->last_seen_block) {
2560         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2561         rs->last_page = 0;
2562     }
2563 
2564     pss_init(pss, rs->last_seen_block, rs->last_page);
2565 
2566     do {
2567         again = true;
2568         found = get_queued_page(rs, pss);
2569 
2570         if (!found) {
2571             /* priority queue empty, so just search for something dirty */
2572             found = find_dirty_block(rs, pss, &again);
2573         }
2574 
2575         if (found) {
2576             pages = ram_save_host_page(rs, pss);
2577         }
2578     } while (!pages && again);
2579 
2580     rs->last_seen_block = pss->block;
2581     rs->last_page = pss->page;
2582 
2583     return pages;
2584 }
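
/*
 * The loop above ends as soon as a host page has actually been written
 * (pages > 0), on error (pages < 0), or once find_dirty_block() reports that
 * a full pass over RAM found nothing (again == false).  last_seen_block and
 * last_page are stored so that the next call resumes the scan where this one
 * left off.
 */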
2585 
2586 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2587 {
2588     uint64_t pages = size / TARGET_PAGE_SIZE;
2589 
2590     if (zero) {
2591         stat64_add(&ram_atomic_counters.duplicate, pages);
2592     } else {
2593         stat64_add(&ram_atomic_counters.normal, pages);
2594         ram_transferred_add(size);
2595         qemu_file_credit_transfer(f, size);
2596     }
2597 }
2598 
2599 static uint64_t ram_bytes_total_common(bool count_ignored)
2600 {
2601     RAMBlock *block;
2602     uint64_t total = 0;
2603 
2604     RCU_READ_LOCK_GUARD();
2605 
2606     if (count_ignored) {
2607         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2608             total += block->used_length;
2609         }
2610     } else {
2611         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2612             total += block->used_length;
2613         }
2614     }
2615     return total;
2616 }
2617 
2618 uint64_t ram_bytes_total(void)
2619 {
2620     return ram_bytes_total_common(false);
2621 }
2622 
2623 static void xbzrle_load_setup(void)
2624 {
2625     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2626 }
2627 
2628 static void xbzrle_load_cleanup(void)
2629 {
2630     g_free(XBZRLE.decoded_buf);
2631     XBZRLE.decoded_buf = NULL;
2632 }
2633 
2634 static void ram_state_cleanup(RAMState **rsp)
2635 {
2636     if (*rsp) {
2637         migration_page_queue_free(*rsp);
2638         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2639         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2640         g_free(*rsp);
2641         *rsp = NULL;
2642     }
2643 }
2644 
2645 static void xbzrle_cleanup(void)
2646 {
2647     XBZRLE_cache_lock();
2648     if (XBZRLE.cache) {
2649         cache_fini(XBZRLE.cache);
2650         g_free(XBZRLE.encoded_buf);
2651         g_free(XBZRLE.current_buf);
2652         g_free(XBZRLE.zero_target_page);
2653         XBZRLE.cache = NULL;
2654         XBZRLE.encoded_buf = NULL;
2655         XBZRLE.current_buf = NULL;
2656         XBZRLE.zero_target_page = NULL;
2657     }
2658     XBZRLE_cache_unlock();
2659 }
2660 
2661 static void ram_save_cleanup(void *opaque)
2662 {
2663     RAMState **rsp = opaque;
2664     RAMBlock *block;
2665 
2666     /* We don't use dirty log with background snapshots */
2667     if (!migrate_background_snapshot()) {
2668         /* The caller holds the iothread lock or is in a BH, so there is
2669          * no write race against the migration bitmap.
2670          */
2671         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2672             /*
2673              * Do not stop dirty logging without having started it, since
2674              * memory_global_dirty_log_stop() asserts that
2675              * memory_global_dirty_log_start()/stop() are used in pairs.
2676              */
2677             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2678         }
2679     }
2680 
2681     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2682         g_free(block->clear_bmap);
2683         block->clear_bmap = NULL;
2684         g_free(block->bmap);
2685         block->bmap = NULL;
2686     }
2687 
2688     xbzrle_cleanup();
2689     compress_threads_save_cleanup();
2690     ram_state_cleanup(rsp);
2691 }
2692 
2693 static void ram_state_reset(RAMState *rs)
2694 {
2695     int i;
2696 
2697     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2698         rs->pss[i].last_sent_block = NULL;
2699     }
2700 
2701     rs->last_seen_block = NULL;
2702     rs->last_page = 0;
2703     rs->last_version = ram_list.version;
2704     rs->xbzrle_enabled = false;
2705 }
2706 
2707 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2708 
2709 /* **** functions for postcopy ***** */
2710 
2711 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2712 {
2713     struct RAMBlock *block;
2714 
2715     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2716         unsigned long *bitmap = block->bmap;
2717         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2718         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2719 
2720         while (run_start < range) {
2721             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2722             ram_discard_range(block->idstr,
2723                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2724                               ((ram_addr_t)(run_end - run_start))
2725                                 << TARGET_PAGE_BITS);
2726             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2727         }
2728     }
2729 }
2730 
2731 /**
2732  * postcopy_send_discard_bm_ram: discard a RAMBlock
2733  *
2734  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2735  *
2736  * @ms: current migration state
2737  * @block: RAMBlock to discard
2738  */
2739 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2740 {
2741     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2742     unsigned long current;
2743     unsigned long *bitmap = block->bmap;
2744 
2745     for (current = 0; current < end; ) {
2746         unsigned long one = find_next_bit(bitmap, end, current);
2747         unsigned long zero, discard_length;
2748 
2749         if (one >= end) {
2750             break;
2751         }
2752 
2753         zero = find_next_zero_bit(bitmap, end, one + 1);
2754 
2755         if (zero >= end) {
2756             discard_length = end - one;
2757         } else {
2758             discard_length = zero - one;
2759         }
2760         postcopy_discard_send_range(ms, one, discard_length);
2761         current = one + discard_length;
2762     }
2763 }
2764 
2765 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2766 
2767 /**
2768  * postcopy_each_ram_send_discard: discard all RAMBlocks
2769  *
2770  * Utility for the outgoing postcopy code.
2771  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2772  *   passing it bitmap indexes and name.
2773  * (qemu_ram_foreach_block ends up passing unscaled lengths
2774  *  which would mean postcopy code would have to deal with target page)
2775  *
2776  * @ms: current migration state
2777  */
2778 static void postcopy_each_ram_send_discard(MigrationState *ms)
2779 {
2780     struct RAMBlock *block;
2781 
2782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2783         postcopy_discard_send_init(ms, block->idstr);
2784 
2785         /*
2786          * Deal with TPS != HPS and huge pages.  It discards any partially
2787          * sent host-page size chunks and marks any partially dirty host-page
2788          * size chunks as all dirty.  In this case the host page is the host page
2789          * for the particular RAMBlock, i.e. it might be a huge page.
2790          */
2791         postcopy_chunk_hostpages_pass(ms, block);
2792 
2793         /*
2794          * Postcopy sends chunks of bitmap over the wire, but it
2795          * just needs indexes at this point, which avoids it having
2796          * target-page-specific code.
2797          */
2798         postcopy_send_discard_bm_ram(ms, block);
2799         postcopy_discard_send_finish(ms);
2800     }
2801 }
2802 
2803 /**
2804  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2805  *
2806  * Helper for postcopy_chunk_hostpages; it's called twice to
2807  * canonicalize the two bitmaps, that are similar, but one is
2808  * inverted.
2809  *
2810  * Postcopy requires that all target pages in a hostpage are dirty or
2811  * clean, not a mix.  This function canonicalizes the bitmaps.
2812  *
2813  * @ms: current migration state
2814  * @block: block that contains the page we want to canonicalize
2815  */
2816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2817 {
2818     RAMState *rs = ram_state;
2819     unsigned long *bitmap = block->bmap;
2820     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2821     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2822     unsigned long run_start;
2823 
2824     if (block->page_size == TARGET_PAGE_SIZE) {
2825         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2826         return;
2827     }
2828 
2829     /* Find a dirty page */
2830     run_start = find_next_bit(bitmap, pages, 0);
2831 
2832     while (run_start < pages) {
2833 
2834         /*
2835          * If the start of this run of pages is in the middle of a host
2836          * page, then we need to fix up this host page.
2837          */
2838         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2839             /* Find the end of this run */
2840             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2841             /*
2842              * If the end isn't at the start of a host page, then the
2843              * run doesn't finish at the end of a host page
2844              * and we need to discard.
2845              */
2846         }
2847 
2848         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2849             unsigned long page;
2850             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2851                                                              host_ratio);
2852             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2853 
2854             /* Clean up the bitmap */
2855             for (page = fixup_start_addr;
2856                  page < fixup_start_addr + host_ratio; page++) {
2857                 /*
2858                  * Remark them as dirty, updating the count for any pages
2859                  * that weren't previously dirty.
2860                  */
2861                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2862             }
2863         }
2864 
2865         /* Find the next dirty page for the next iteration */
2866         run_start = find_next_bit(bitmap, pages, run_start);
2867     }
2868 }
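
/*
 * Worked example, assuming 2MiB host pages and 4KiB target pages
 * (host_ratio == 512): a dirty run starting at target page 700 lies inside
 * host page [512, 1024), so all 512 bits of that host page are marked dirty
 * (only counting the ones that were previously clear) and the scan resumes
 * at page 1024.
 */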
2869 
2870 /**
2871  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2872  *
2873  * Transmit the set of pages to be discarded after precopy to the target;
2874  * these are pages that:
2875  *     a) Have been previously transmitted but are now dirty again
2876  *     b) Have never been transmitted; this ensures that
2877  *        any pages on the destination that have been mapped by background
2878  *        tasks get discarded (transparent huge pages are the specific concern)
2879  * Hopefully this set is pretty sparse.
2880  *
2881  * @ms: current migration state
2882  */
2883 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2884 {
2885     RAMState *rs = ram_state;
2886 
2887     RCU_READ_LOCK_GUARD();
2888 
2889     /* This should be our last sync, the src is now paused */
2890     migration_bitmap_sync(rs);
2891 
2892     /* Easiest way to make sure we don't resume in the middle of a host-page */
2893     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2894     rs->last_seen_block = NULL;
2895     rs->last_page = 0;
2896 
2897     postcopy_each_ram_send_discard(ms);
2898 
2899     trace_ram_postcopy_send_discard_bitmap();
2900 }
2901 
2902 /**
2903  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2904  *
2905  * Returns zero on success
2906  *
2907  * @rbname: name of the RAMBlock of the request. NULL means the
2908  *          same as the last one.
2909  * @start: starting offset (in bytes) within the RAMBlock
2910  * @length: length (in bytes) to discard
2911  */
2912 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2913 {
2914     trace_ram_discard_range(rbname, start, length);
2915 
2916     RCU_READ_LOCK_GUARD();
2917     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2918 
2919     if (!rb) {
2920         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2921         return -1;
2922     }
2923 
2924     /*
2925      * On source VM, we don't need to update the received bitmap since
2926      * we don't even have one.
2927      */
2928     if (rb->receivedmap) {
2929         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2930                      length >> qemu_target_page_bits());
2931     }
2932 
2933     return ram_block_discard_range(rb, start, length);
2934 }
2935 
2936 /*
2937  * For every allocation, we will try not to crash the VM if the
2938  * allocation failed.
2939  */
2940 static int xbzrle_init(void)
2941 {
2942     Error *local_err = NULL;
2943 
2944     if (!migrate_use_xbzrle()) {
2945         return 0;
2946     }
2947 
2948     XBZRLE_cache_lock();
2949 
2950     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2951     if (!XBZRLE.zero_target_page) {
2952         error_report("%s: Error allocating zero page", __func__);
2953         goto err_out;
2954     }
2955 
2956     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2957                               TARGET_PAGE_SIZE, &local_err);
2958     if (!XBZRLE.cache) {
2959         error_report_err(local_err);
2960         goto free_zero_page;
2961     }
2962 
2963     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2964     if (!XBZRLE.encoded_buf) {
2965         error_report("%s: Error allocating encoded_buf", __func__);
2966         goto free_cache;
2967     }
2968 
2969     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2970     if (!XBZRLE.current_buf) {
2971         error_report("%s: Error allocating current_buf", __func__);
2972         goto free_encoded_buf;
2973     }
2974 
2975     /* We are all good */
2976     XBZRLE_cache_unlock();
2977     return 0;
2978 
2979 free_encoded_buf:
2980     g_free(XBZRLE.encoded_buf);
2981     XBZRLE.encoded_buf = NULL;
2982 free_cache:
2983     cache_fini(XBZRLE.cache);
2984     XBZRLE.cache = NULL;
2985 free_zero_page:
2986     g_free(XBZRLE.zero_target_page);
2987     XBZRLE.zero_target_page = NULL;
2988 err_out:
2989     XBZRLE_cache_unlock();
2990     return -ENOMEM;
2991 }
2992 
2993 static int ram_state_init(RAMState **rsp)
2994 {
2995     *rsp = g_try_new0(RAMState, 1);
2996 
2997     if (!*rsp) {
2998         error_report("%s: Init ramstate fail", __func__);
2999         return -1;
3000     }
3001 
3002     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3003     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3004     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3005 
3006     /*
3007      * Count the total number of pages used by ram blocks not including any
3008      * gaps due to alignment or unplugs.
3009      * This must match with the initial values of dirty bitmap.
3010      */
3011     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3012     ram_state_reset(*rsp);
3013 
3014     return 0;
3015 }
3016 
3017 static void ram_list_init_bitmaps(void)
3018 {
3019     MigrationState *ms = migrate_get_current();
3020     RAMBlock *block;
3021     unsigned long pages;
3022     uint8_t shift;
3023 
3024     /* Skip setting bitmap if there is no RAM */
3025     if (ram_bytes_total()) {
3026         shift = ms->clear_bitmap_shift;
3027         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3028             error_report("clear_bitmap_shift (%u) too big, using "
3029                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3030             shift = CLEAR_BITMAP_SHIFT_MAX;
3031         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3032             error_report("clear_bitmap_shift (%u) too small, using "
3033                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3034             shift = CLEAR_BITMAP_SHIFT_MIN;
3035         }
3036 
3037         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3038             pages = block->max_length >> TARGET_PAGE_BITS;
3039             /*
3040              * The initial dirty bitmap for migration must be set with all
3041              * ones to make sure we'll migrate every guest RAM page to
3042              * destination.
3043              * Here we set RAMBlock.bmap all to 1 because when restarting a
3044              * new migration after a failed migration, ram_list.
3045              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3046              * guest memory.
3047              */
3048             block->bmap = bitmap_new(pages);
3049             bitmap_set(block->bmap, 0, pages);
3050             block->clear_bmap_shift = shift;
3051             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3052         }
3053     }
3054 }
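
/*
 * Each clear_bmap bit tracks a chunk of 2^clear_bmap_shift target pages; for
 * example, a shift of 18 with 4KiB target pages gives 1GiB chunks, which
 * presumably lets the underlying dirty-log clearing be deferred and batched
 * per chunk rather than per page.
 */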
3055 
3056 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3057 {
3058     unsigned long pages;
3059     RAMBlock *rb;
3060 
3061     RCU_READ_LOCK_GUARD();
3062 
3063     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3064         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3065         rs->migration_dirty_pages -= pages;
3066     }
3067 }
3068 
3069 static void ram_init_bitmaps(RAMState *rs)
3070 {
3071     /* For memory_global_dirty_log_start below.  */
3072     qemu_mutex_lock_iothread();
3073     qemu_mutex_lock_ramlist();
3074 
3075     WITH_RCU_READ_LOCK_GUARD() {
3076         ram_list_init_bitmaps();
3077         /* We don't use dirty log with background snapshots */
3078         if (!migrate_background_snapshot()) {
3079             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3080             migration_bitmap_sync_precopy(rs);
3081         }
3082     }
3083     qemu_mutex_unlock_ramlist();
3084     qemu_mutex_unlock_iothread();
3085 
3086     /*
3087      * After an eventual first bitmap sync, fixup the initial bitmap
3088      * containing all 1s to exclude any discarded pages from migration.
3089      */
3090     migration_bitmap_clear_discarded_pages(rs);
3091 }
3092 
3093 static int ram_init_all(RAMState **rsp)
3094 {
3095     if (ram_state_init(rsp)) {
3096         return -1;
3097     }
3098 
3099     if (xbzrle_init()) {
3100         ram_state_cleanup(rsp);
3101         return -1;
3102     }
3103 
3104     ram_init_bitmaps(*rsp);
3105 
3106     return 0;
3107 }
3108 
3109 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3110 {
3111     RAMBlock *block;
3112     uint64_t pages = 0;
3113 
3114     /*
3115      * Postcopy is not using xbzrle/compression, so no need for that.
3116      * Also, since the source is already halted, we don't need to care
3117      * about dirty page logging either.
3118      */
3119 
3120     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3121         pages += bitmap_count_one(block->bmap,
3122                                   block->used_length >> TARGET_PAGE_BITS);
3123     }
3124 
3125     /* This may not be aligned with current bitmaps. Recalculate. */
3126     rs->migration_dirty_pages = pages;
3127 
3128     ram_state_reset(rs);
3129 
3130     /* Update RAMState cache of output QEMUFile */
3131     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3132 
3133     trace_ram_state_resume_prepare(pages);
3134 }
3135 
3136 /*
3137  * This function clears bits of the free pages reported by the caller from the
3138  * migration dirty bitmap. @addr is the host address corresponding to the
3139  * start of the continuous guest free pages, and @len is the total bytes of
3140  * those pages.
3141  */
3142 void qemu_guest_free_page_hint(void *addr, size_t len)
3143 {
3144     RAMBlock *block;
3145     ram_addr_t offset;
3146     size_t used_len, start, npages;
3147     MigrationState *s = migrate_get_current();
3148 
3149     /* This function is currently expected to be used during live migration */
3150     if (!migration_is_setup_or_active(s->state)) {
3151         return;
3152     }
3153 
3154     for (; len > 0; len -= used_len, addr += used_len) {
3155         block = qemu_ram_block_from_host(addr, false, &offset);
3156         if (unlikely(!block || offset >= block->used_length)) {
3157             /*
3158              * The implementation might not support RAMBlock resize during
3159              * live migration, but it could happen in theory with future
3160              * updates. So we add a check here to capture that case.
3161              */
3162             error_report_once("%s unexpected error", __func__);
3163             return;
3164         }
3165 
3166         if (len <= block->used_length - offset) {
3167             used_len = len;
3168         } else {
3169             used_len = block->used_length - offset;
3170         }
3171 
3172         start = offset >> TARGET_PAGE_BITS;
3173         npages = used_len >> TARGET_PAGE_BITS;
3174 
3175         qemu_mutex_lock(&ram_state->bitmap_mutex);
3176         /*
3177          * The skipped free pages are equivalent to having been sent from
3178          * clear_bmap's perspective, so clear the bits from the memory region
3179          * bitmap which are initially set. Otherwise those skipped pages will
3180          * be sent in the next round after syncing from the memory region bitmap.
3181          */
3182         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3183         ram_state->migration_dirty_pages -=
3184                       bitmap_count_one_with_offset(block->bmap, start, npages);
3185         bitmap_clear(block->bmap, start, npages);
3186         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3187     }
3188 }
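
/*
 * Both bitmaps are updated under bitmap_mutex: the memory-region (clear_bmap
 * protected) bits are cleared first so the hinted pages do not reappear at
 * the next sync, then the per-block bmap bits are cleared and
 * migration_dirty_pages is reduced by however many of them were actually set.
 */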
3189 
3190 /*
3191  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3192  * long-running RCU critical section.  When rcu-reclaims in the code
3193  * start to become numerous it will be necessary to reduce the
3194  * granularity of these critical sections.
3195  */
3196 
3197 /**
3198  * ram_save_setup: Setup RAM for migration
3199  *
3200  * Returns zero to indicate success and negative for error
3201  *
3202  * @f: QEMUFile where to send the data
3203  * @opaque: RAMState pointer
3204  */
3205 static int ram_save_setup(QEMUFile *f, void *opaque)
3206 {
3207     RAMState **rsp = opaque;
3208     RAMBlock *block;
3209     int ret;
3210 
3211     if (compress_threads_save_setup()) {
3212         return -1;
3213     }
3214 
3215     /* migration has already set up the bitmap; reuse it. */
3216     if (!migration_in_colo_state()) {
3217         if (ram_init_all(rsp) != 0) {
3218             compress_threads_save_cleanup();
3219             return -1;
3220         }
3221     }
3222     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3223 
3224     WITH_RCU_READ_LOCK_GUARD() {
3225         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3226 
3227         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3228             qemu_put_byte(f, strlen(block->idstr));
3229             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3230             qemu_put_be64(f, block->used_length);
3231             if (migrate_postcopy_ram() && block->page_size !=
3232                                           qemu_host_page_size) {
3233                 qemu_put_be64(f, block->page_size);
3234             }
3235             if (migrate_ignore_shared()) {
3236                 qemu_put_be64(f, block->mr->addr);
3237             }
3238         }
3239     }
3240 
3241     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3242     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3243 
3244     ret = multifd_send_sync_main(f);
3245     if (ret < 0) {
3246         return ret;
3247     }
3248 
3249     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3250     qemu_fflush(f);
3251 
3252     return 0;
3253 }
3254 
3255 /**
3256  * ram_save_iterate: iterative stage for migration
3257  *
3258  * Returns zero to indicate success and negative for error
3259  *
3260  * @f: QEMUFile where to send the data
3261  * @opaque: RAMState pointer
3262  */
3263 static int ram_save_iterate(QEMUFile *f, void *opaque)
3264 {
3265     RAMState **temp = opaque;
3266     RAMState *rs = *temp;
3267     int ret = 0;
3268     int i;
3269     int64_t t0;
3270     int done = 0;
3271 
3272     if (blk_mig_bulk_active()) {
3273         /* Avoid transferring ram during bulk phase of block migration as
3274          * the bulk phase will usually take a long time and transferring
3275          * ram updates during that time is pointless. */
3276         goto out;
3277     }
3278 
3279     /*
3280      * We'll hold this lock for a while, but that's okay for two reasons.
3281      * Firstly, the only other thread that may take it is the one calling
3282      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3283      * MAX_WAIT below (if curious, also see commit 4508bd9ed8053ce), which
3284      * guarantees that we'll at least release it on a regular basis.
3285      */
3286     qemu_mutex_lock(&rs->bitmap_mutex);
3287     WITH_RCU_READ_LOCK_GUARD() {
3288         if (ram_list.version != rs->last_version) {
3289             ram_state_reset(rs);
3290         }
3291 
3292         /* Read version before ram_list.blocks */
3293         smp_rmb();
3294 
3295         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3296 
3297         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3298         i = 0;
3299         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3300                postcopy_has_request(rs)) {
3301             int pages;
3302 
3303             if (qemu_file_get_error(f)) {
3304                 break;
3305             }
3306 
3307             pages = ram_find_and_save_block(rs);
3308             /* no more pages to send */
3309             if (pages == 0) {
3310                 done = 1;
3311                 break;
3312             }
3313 
3314             if (pages < 0) {
3315                 qemu_file_set_error(f, pages);
3316                 break;
3317             }
3318 
3319             rs->target_page_count += pages;
3320 
3321             /*
3322              * During postcopy, it is necessary to make sure one whole host
3323              * page is sent in one chunk.
3324              */
3325             if (migrate_postcopy_ram()) {
3326                 flush_compressed_data(rs);
3327             }
3328 
3329             /*
3330              * We want to check on the first loop iteration, just in case it
3331              * was the first time and we had to sync the dirty bitmap.
3332              * qemu_clock_get_ns() is a bit expensive, so we only check once
3333              * every few iterations.
3334              */
3335             if ((i & 63) == 0) {
3336                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3337                               1000000;
3338                 if (t1 > MAX_WAIT) {
3339                     trace_ram_save_iterate_big_wait(t1, i);
3340                     break;
3341                 }
3342             }
3343             i++;
3344         }
3345     }
3346     qemu_mutex_unlock(&rs->bitmap_mutex);
3347 
3348     /*
3349      * Must occur before EOS (or any QEMUFile operation)
3350      * because of RDMA protocol.
3351      */
3352     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3353 
3354 out:
3355     if (ret >= 0
3356         && migration_is_setup_or_active(migrate_get_current()->state)) {
3357         ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3358         if (ret < 0) {
3359             return ret;
3360         }
3361 
3362         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3363         qemu_fflush(f);
3364         ram_transferred_add(8);
3365 
3366         ret = qemu_file_get_error(f);
3367     }
3368     if (ret < 0) {
3369         return ret;
3370     }
3371 
3372     return done;
3373 }
3374 
3375 /**
3376  * ram_save_complete: function called to send the remaining amount of ram
3377  *
3378  * Returns zero to indicate success or negative on error
3379  *
3380  * Called with the iothread lock held
3381  *
3382  * @f: QEMUFile where to send the data
3383  * @opaque: RAMState pointer
3384  */
3385 static int ram_save_complete(QEMUFile *f, void *opaque)
3386 {
3387     RAMState **temp = opaque;
3388     RAMState *rs = *temp;
3389     int ret = 0;
3390 
3391     rs->last_stage = !migration_in_colo_state();
3392 
3393     WITH_RCU_READ_LOCK_GUARD() {
3394         if (!migration_in_postcopy()) {
3395             migration_bitmap_sync_precopy(rs);
3396         }
3397 
3398         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3399 
3400         /* try transferring iterative blocks of memory */
3401 
3402         /* flush all remaining blocks regardless of rate limiting */
3403         qemu_mutex_lock(&rs->bitmap_mutex);
3404         while (true) {
3405             int pages;
3406 
3407             pages = ram_find_and_save_block(rs);
3408             /* no more blocks to send */
3409             if (pages == 0) {
3410                 break;
3411             }
3412             if (pages < 0) {
3413                 ret = pages;
3414                 break;
3415             }
3416         }
3417         qemu_mutex_unlock(&rs->bitmap_mutex);
3418 
3419         flush_compressed_data(rs);
3420         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3421     }
3422 
3423     if (ret < 0) {
3424         return ret;
3425     }
3426 
3427     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3428     if (ret < 0) {
3429         return ret;
3430     }
3431 
3432     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3433     qemu_fflush(f);
3434 
3435     return 0;
3436 }
3437 
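/*
 * ram_state_pending_estimate: cheap estimate of the remaining RAM to send,
 * based on the current dirty-page count without re-syncing the dirty
 * bitmap.  The result is attributed to the postcopy bucket when
 * postcopy-ram is enabled, otherwise to the precopy bucket.
 */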
3438 static void ram_state_pending_estimate(void *opaque,
3439                                        uint64_t *res_precopy_only,
3440                                        uint64_t *res_compatible,
3441                                        uint64_t *res_postcopy_only)
3442 {
3443     RAMState **temp = opaque;
3444     RAMState *rs = *temp;
3445 
3446     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3447 
3448     if (migrate_postcopy_ram()) {
3449         /* We can do postcopy, and all the data is postcopiable */
3450         *res_postcopy_only += remaining_size;
3451     } else {
3452         *res_precopy_only += remaining_size;
3453     }
3454 }
3455 
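/*
 * ram_state_pending_exact: exact variant of the estimate above.  Outside
 * of postcopy it re-syncs the dirty bitmap (under the iothread lock and
 * an RCU read lock) before reporting the remaining bytes.
 */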
3456 static void ram_state_pending_exact(void *opaque,
3457                                     uint64_t *res_precopy_only,
3458                                     uint64_t *res_compatible,
3459                                     uint64_t *res_postcopy_only)
3460 {
3461     RAMState **temp = opaque;
3462     RAMState *rs = *temp;
3463 
3464     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3465 
3466     if (!migration_in_postcopy()) {
3467         qemu_mutex_lock_iothread();
3468         WITH_RCU_READ_LOCK_GUARD() {
3469             migration_bitmap_sync_precopy(rs);
3470         }
3471         qemu_mutex_unlock_iothread();
3472         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3473     }
3474 
3475     if (migrate_postcopy_ram()) {
3476         /* We can do postcopy, and all the data is postcopiable */
3477         *res_compatible += remaining_size;
3478     } else {
3479         *res_precopy_only += remaining_size;
3480     }
3481 }
3482 
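/*
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * Wire format, as read below: one byte of encoding flags (must be
 * ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length (at most
 * TARGET_PAGE_SIZE), then the encoded data, which is applied as a delta
 * on top of the current contents of @host.
 *
 * Returns 0 on success, -1 on a malformed or undecodable page.
 */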
3483 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3484 {
3485     unsigned int xh_len;
3486     int xh_flags;
3487     uint8_t *loaded_data;
3488 
3489     /* extract RLE header */
3490     xh_flags = qemu_get_byte(f);
3491     xh_len = qemu_get_be16(f);
3492 
3493     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3494         error_report("Failed to load XBZRLE page - wrong compression!");
3495         return -1;
3496     }
3497 
3498     if (xh_len > TARGET_PAGE_SIZE) {
3499         error_report("Failed to load XBZRLE page - len overflow!");
3500         return -1;
3501     }
3502     loaded_data = XBZRLE.decoded_buf;
3503     /* load data and decode */
3504     /* it can change loaded_data to point to an internal buffer */
3505     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3506 
3507     /* decode RLE */
3508     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3509                              TARGET_PAGE_SIZE) == -1) {
3510         error_report("Failed to load XBZRLE page - decode error!");
3511         return -1;
3512     }
3513 
3514     return 0;
3515 }
3516 
3517 /**
3518  * ram_block_from_stream: read a RAMBlock id from the migration stream
3519  *
3520  * Must be called from within a rcu critical section.
3521  *
3522  * Returns a pointer from within the RCU-protected ram_list.
3523  *
3524  * @mis: the migration incoming state pointer
3525  * @f: QEMUFile where to read the data from
3526  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3527  * @channel: the channel we're using
3528  */
3529 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3530                                               QEMUFile *f, int flags,
3531                                               int channel)
3532 {
3533     RAMBlock *block = mis->last_recv_block[channel];
3534     char id[256];
3535     uint8_t len;
3536 
3537     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3538         if (!block) {
3539             error_report("Ack, bad migration stream!");
3540             return NULL;
3541         }
3542         return block;
3543     }
3544 
3545     len = qemu_get_byte(f);
3546     qemu_get_buffer(f, (uint8_t *)id, len);
3547     id[len] = 0;
3548 
3549     block = qemu_ram_block_by_name(id);
3550     if (!block) {
3551         error_report("Can't find block %s", id);
3552         return NULL;
3553     }
3554 
3555     if (ramblock_is_ignored(block)) {
3556         error_report("block %s should not be migrated !", id);
3557         return NULL;
3558     }
3559 
3560     mis->last_recv_block[channel] = block;
3561 
3562     return block;
3563 }
3564 
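/*
 * Translate a RAMBlock offset into a host pointer, or return NULL when
 * the offset lies outside the block's used length.
 */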
3565 static inline void *host_from_ram_block_offset(RAMBlock *block,
3566                                                ram_addr_t offset)
3567 {
3568     if (!offset_in_ramblock(block, offset)) {
3569         return NULL;
3570     }
3571 
3572     return block->host + offset;
3573 }
3574 
3575 static void *host_page_from_ram_block_offset(RAMBlock *block,
3576                                              ram_addr_t offset)
3577 {
3578     /* Note: Explicitly no check against offset_in_ramblock(). */
3579     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3580                                    block->page_size);
3581 }
3582 
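/*
 * Return the offset of the given RAMBlock offset within its (possibly
 * huge) host page.
 */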
3583 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3584                                                          ram_addr_t offset)
3585 {
3586     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3587 }
3588 
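/*
 * Return a pointer into the block's COLO cache for @offset.  When
 * @record_bitmap is set, the page is also marked in the block's dirty
 * bitmap so a later checkpoint knows it has to be flushed into the
 * SVM's RAM.  Returns NULL if the offset is out of range or no COLO
 * cache has been allocated for the block.
 */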
3589 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3590                              ram_addr_t offset, bool record_bitmap)
3591 {
3592     if (!offset_in_ramblock(block, offset)) {
3593         return NULL;
3594     }
3595     if (!block->colo_cache) {
3596         error_report("%s: colo_cache is NULL in block :%s",
3597                      __func__, block->idstr);
3598         return NULL;
3599     }
3600 
3601     /*
3602      * During a COLO checkpoint we need a bitmap of these migrated pages.
3603      * It helps us decide which pages in the RAM cache should be flushed
3604      * into the VM's RAM later.
3605      */
3606     if (record_bitmap &&
3607         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3608         ram_state->migration_dirty_pages++;
3609     }
3610     return block->colo_cache + offset;
3611 }
3612 
3613 /**
3614  * ram_handle_compressed: handle the zero page case
3615  *
3616  * If a page (or a whole RDMA chunk) has been
3617  * determined to be zero, then zap it.
3618  *
3619  * @host: host address for the zero page
3620  * @ch: what the page is filled from.  We only support zero
3621  * @size: size of the zero page
3622  */
3623 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3624 {
3625     if (ch != 0 || !buffer_is_zero(host, size)) {
3626         memset(host, ch, size);
3627     }
3628 }
3629 
3630 /* return the size after decompression, or a negative value on error */
3631 static int
3632 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3633                      const uint8_t *source, size_t source_len)
3634 {
3635     int err;
3636 
3637     err = inflateReset(stream);
3638     if (err != Z_OK) {
3639         return -1;
3640     }
3641 
3642     stream->avail_in = source_len;
3643     stream->next_in = (uint8_t *)source;
3644     stream->avail_out = dest_len;
3645     stream->next_out = dest;
3646 
3647     err = inflate(stream, Z_NO_FLUSH);
3648     if (err != Z_STREAM_END) {
3649         return -1;
3650     }
3651 
3652     return stream->total_out;
3653 }
3654 
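/*
 * Decompression worker thread body: sleep on the per-thread condition
 * variable until a (destination, length) pair is handed over, inflate
 * the page into place, then mark the slot done under decomp_done_lock
 * so that waiters can reuse this thread.
 */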
3655 static void *do_data_decompress(void *opaque)
3656 {
3657     DecompressParam *param = opaque;
3658     unsigned long pagesize;
3659     uint8_t *des;
3660     int len, ret;
3661 
3662     qemu_mutex_lock(&param->mutex);
3663     while (!param->quit) {
3664         if (param->des) {
3665             des = param->des;
3666             len = param->len;
3667             param->des = 0;
3668             qemu_mutex_unlock(&param->mutex);
3669 
3670             pagesize = TARGET_PAGE_SIZE;
3671 
3672             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3673                                        param->compbuf, len);
3674             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3675                 error_report("decompress data failed");
3676                 qemu_file_set_error(decomp_file, ret);
3677             }
3678 
3679             qemu_mutex_lock(&decomp_done_lock);
3680             param->done = true;
3681             qemu_cond_signal(&decomp_done_cond);
3682             qemu_mutex_unlock(&decomp_done_lock);
3683 
3684             qemu_mutex_lock(&param->mutex);
3685         } else {
3686             qemu_cond_wait(&param->cond, &param->mutex);
3687         }
3688     }
3689     qemu_mutex_unlock(&param->mutex);
3690 
3691     return NULL;
3692 }
3693 
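/*
 * Block until every decompression worker has finished its outstanding
 * page.  Returns 0 when compression is not in use, otherwise any error
 * the workers recorded on decomp_file.
 */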
3694 static int wait_for_decompress_done(void)
3695 {
3696     int idx, thread_count;
3697 
3698     if (!migrate_use_compression()) {
3699         return 0;
3700     }
3701 
3702     thread_count = migrate_decompress_threads();
3703     qemu_mutex_lock(&decomp_done_lock);
3704     for (idx = 0; idx < thread_count; idx++) {
3705         while (!decomp_param[idx].done) {
3706             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3707         }
3708     }
3709     qemu_mutex_unlock(&decomp_done_lock);
3710     return qemu_file_get_error(decomp_file);
3711 }
3712 
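/*
 * Stop and join the decompression worker threads and free their state.
 * Safe to call on a partially initialized pool: a NULL compbuf marks
 * workers that were never set up.
 */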
3713 static void compress_threads_load_cleanup(void)
3714 {
3715     int i, thread_count;
3716 
3717     if (!migrate_use_compression()) {
3718         return;
3719     }
3720     thread_count = migrate_decompress_threads();
3721     for (i = 0; i < thread_count; i++) {
3722         /*
3723          * We use it as an indicator of whether the thread has been
3724          * properly initialized or not.
3725          */
3726         if (!decomp_param[i].compbuf) {
3727             break;
3728         }
3729 
3730         qemu_mutex_lock(&decomp_param[i].mutex);
3731         decomp_param[i].quit = true;
3732         qemu_cond_signal(&decomp_param[i].cond);
3733         qemu_mutex_unlock(&decomp_param[i].mutex);
3734     }
3735     for (i = 0; i < thread_count; i++) {
3736         if (!decomp_param[i].compbuf) {
3737             break;
3738         }
3739 
3740         qemu_thread_join(decompress_threads + i);
3741         qemu_mutex_destroy(&decomp_param[i].mutex);
3742         qemu_cond_destroy(&decomp_param[i].cond);
3743         inflateEnd(&decomp_param[i].stream);
3744         g_free(decomp_param[i].compbuf);
3745         decomp_param[i].compbuf = NULL;
3746     }
3747     g_free(decompress_threads);
3748     g_free(decomp_param);
3749     decompress_threads = NULL;
3750     decomp_param = NULL;
3751     decomp_file = NULL;
3752 }
3753 
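/*
 * Create the pool of decompression worker threads for the incoming side.
 * Returns 0 on success (or when compression is disabled) and -1 if zlib
 * stream initialization fails, in which case any partially created
 * workers are torn down again.
 */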
3754 static int compress_threads_load_setup(QEMUFile *f)
3755 {
3756     int i, thread_count;
3757 
3758     if (!migrate_use_compression()) {
3759         return 0;
3760     }
3761 
3762     thread_count = migrate_decompress_threads();
3763     decompress_threads = g_new0(QemuThread, thread_count);
3764     decomp_param = g_new0(DecompressParam, thread_count);
3765     qemu_mutex_init(&decomp_done_lock);
3766     qemu_cond_init(&decomp_done_cond);
3767     decomp_file = f;
3768     for (i = 0; i < thread_count; i++) {
3769         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3770             goto exit;
3771         }
3772 
3773         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3774         qemu_mutex_init(&decomp_param[i].mutex);
3775         qemu_cond_init(&decomp_param[i].cond);
3776         decomp_param[i].done = true;
3777         decomp_param[i].quit = false;
3778         qemu_thread_create(decompress_threads + i, "decompress",
3779                            do_data_decompress, decomp_param + i,
3780                            QEMU_THREAD_JOINABLE);
3781     }
3782     return 0;
3783 exit:
3784     compress_threads_load_cleanup();
3785     return -1;
3786 }
3787 
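/*
 * Hand one compressed page to an idle decompression worker.  The @len
 * bytes of compressed payload are read from @f into the worker's
 * buffer and decompressed into @host asynchronously; if every worker
 * is busy, wait on decomp_done_cond until one becomes free.
 */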
3788 static void decompress_data_with_multi_threads(QEMUFile *f,
3789                                                void *host, int len)
3790 {
3791     int idx, thread_count;
3792 
3793     thread_count = migrate_decompress_threads();
3794     QEMU_LOCK_GUARD(&decomp_done_lock);
3795     while (true) {
3796         for (idx = 0; idx < thread_count; idx++) {
3797             if (decomp_param[idx].done) {
3798                 decomp_param[idx].done = false;
3799                 qemu_mutex_lock(&decomp_param[idx].mutex);
3800                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3801                 decomp_param[idx].des = host;
3802                 decomp_param[idx].len = len;
3803                 qemu_cond_signal(&decomp_param[idx].cond);
3804                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3805                 break;
3806             }
3807         }
3808         if (idx < thread_count) {
3809             break;
3810         } else {
3811             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3812         }
3813     }
3814 }
3815 
3816 static void colo_init_ram_state(void)
3817 {
3818     ram_state_init(&ram_state);
3819 }
3820 
3821 /*
3822  * colo cache: this is for the secondary VM; we cache the whole
3823  * memory of the secondary VM.  The global lock must be held to
3824  * call this helper.
3825  */
3826 int colo_init_ram_cache(void)
3827 {
3828     RAMBlock *block;
3829 
3830     WITH_RCU_READ_LOCK_GUARD() {
3831         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3832             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3833                                                     NULL, false, false);
3834             if (!block->colo_cache) {
3835                 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3836                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3837                              block->used_length);
3838                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3839                     if (block->colo_cache) {
3840                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3841                         block->colo_cache = NULL;
3842                     }
3843                 }
3844                 return -errno;
3845             }
3846             if (!machine_dump_guest_core(current_machine)) {
3847                 qemu_madvise(block->colo_cache, block->used_length,
3848                              QEMU_MADV_DONTDUMP);
3849             }
3850         }
3851     }
3852 
3853     /*
3854      * Record the dirty pages that were sent by the PVM; we use this dirty
3855      * bitmap to decide which pages in the cache should be flushed into the
3856      * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3857      */
3858     if (ram_bytes_total()) {
3859         RAMBlock *block;
3860 
3861         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3862             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3863             block->bmap = bitmap_new(pages);
3864         }
3865     }
3866 
3867     colo_init_ram_state();
3868     return 0;
3869 }
3870 
3871 /* TODO: duplicated with ram_init_bitmaps */
3872 void colo_incoming_start_dirty_log(void)
3873 {
3874     RAMBlock *block = NULL;
3875     /* For memory_global_dirty_log_start below. */
3876     qemu_mutex_lock_iothread();
3877     qemu_mutex_lock_ramlist();
3878 
3879     memory_global_dirty_log_sync();
3880     WITH_RCU_READ_LOCK_GUARD() {
3881         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882             ramblock_sync_dirty_bitmap(ram_state, block);
3883             /* Discard this dirty bitmap record */
3884             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3885         }
3886         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3887     }
3888     ram_state->migration_dirty_pages = 0;
3889     qemu_mutex_unlock_ramlist();
3890     qemu_mutex_unlock_iothread();
3891 }
3892 
3893 /* The global lock must be held to call this helper */
3894 void colo_release_ram_cache(void)
3895 {
3896     RAMBlock *block;
3897 
3898     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3899     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3900         g_free(block->bmap);
3901         block->bmap = NULL;
3902     }
3903 
3904     WITH_RCU_READ_LOCK_GUARD() {
3905         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3906             if (block->colo_cache) {
3907                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3908                 block->colo_cache = NULL;
3909             }
3910         }
3911     }
3912     ram_state_cleanup(&ram_state);
3913 }
3914 
3915 /**
3916  * ram_load_setup: Setup RAM for migration incoming side
3917  *
3918  * Returns zero to indicate success and negative for error
3919  *
3920  * @f: QEMUFile where to receive the data
3921  * @opaque: RAMState pointer
3922  */
3923 static int ram_load_setup(QEMUFile *f, void *opaque)
3924 {
3925     if (compress_threads_load_setup(f)) {
3926         return -1;
3927     }
3928 
3929     xbzrle_load_setup();
3930     ramblock_recv_map_init();
3931 
3932     return 0;
3933 }
3934 
3935 static int ram_load_cleanup(void *opaque)
3936 {
3937     RAMBlock *rb;
3938 
3939     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3940         qemu_ram_block_writeback(rb);
3941     }
3942 
3943     xbzrle_load_cleanup();
3944     compress_threads_load_cleanup();
3945 
3946     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3947         g_free(rb->receivedmap);
3948         rb->receivedmap = NULL;
3949     }
3950 
3951     return 0;
3952 }
3953 
3954 /**
3955  * ram_postcopy_incoming_init: allocate postcopy data structures
3956  *
3957  * Returns 0 for success and negative if there was one error
3958  *
3959  * @mis: current migration incoming state
3960  *
3961  * Allocate the data structures etc. needed by incoming migration with
3962  * postcopy-ram.  postcopy-ram's similarly named
3963  * postcopy_ram_incoming_init() does the work.
3964  */
3965 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3966 {
3967     return postcopy_ram_incoming_init(mis);
3968 }
3969 
3970 /**
3971  * ram_load_postcopy: load a page in postcopy case
3972  *
3973  * Returns 0 for success or -errno in case of error
3974  *
3975  * Called in postcopy mode by ram_load().
3976  * rcu_read_lock is taken prior to this being called.
3977  *
3978  * @f: QEMUFile where to send the data
3979  * @channel: the channel to use for loading
3980  */
3981 int ram_load_postcopy(QEMUFile *f, int channel)
3982 {
3983     int flags = 0, ret = 0;
3984     bool place_needed = false;
3985     bool matches_target_page_size = false;
3986     MigrationIncomingState *mis = migration_incoming_get_current();
3987     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3988 
3989     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3990         ram_addr_t addr;
3991         void *page_buffer = NULL;
3992         void *place_source = NULL;
3993         RAMBlock *block = NULL;
3994         uint8_t ch;
3995         int len;
3996 
3997         addr = qemu_get_be64(f);
3998 
3999         /*
4000          * If there is a QEMUFile error, we should stop here; "addr"
4001          * may then be invalid.
4002          */
4003         ret = qemu_file_get_error(f);
4004         if (ret) {
4005             break;
4006         }
4007 
4008         flags = addr & ~TARGET_PAGE_MASK;
4009         addr &= TARGET_PAGE_MASK;
4010 
4011         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4012         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4013                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4014             block = ram_block_from_stream(mis, f, flags, channel);
4015             if (!block) {
4016                 ret = -EINVAL;
4017                 break;
4018             }
4019 
4020             /*
4021              * Relying on used_length is racy and can result in false positives.
4022              * We might place pages beyond used_length in case RAM was shrunk
4023              * while in postcopy, which is fine - trying to place via
4024              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4025              */
4026             if (!block->host || addr >= block->postcopy_length) {
4027                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4028                 ret = -EINVAL;
4029                 break;
4030             }
4031             tmp_page->target_pages++;
4032             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4033             /*
4034              * Postcopy requires that we place whole host pages atomically;
4035              * these may be huge pages for RAMBlocks that are backed by
4036              * hugetlbfs.
4037              * To make it atomic, the data is read into a temporary page
4038              * that's moved into place later.
4039              * The migration protocol uses (possibly smaller) target pages;
4040              * however, the source ensures it always sends all the components
4041              * of a host page in one chunk.
4042              */
4043             page_buffer = tmp_page->tmp_huge_page +
4044                           host_page_offset_from_ram_block_offset(block, addr);
4045             /* If all TP are zero then we can optimise the place */
4046             if (tmp_page->target_pages == 1) {
4047                 tmp_page->host_addr =
4048                     host_page_from_ram_block_offset(block, addr);
4049             } else if (tmp_page->host_addr !=
4050                        host_page_from_ram_block_offset(block, addr)) {
4051                 /* not the 1st TP within the HP */
4052                 error_report("Non-same host page detected on channel %d: "
4053                              "Target host page %p, received host page %p "
4054                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4055                              channel, tmp_page->host_addr,
4056                              host_page_from_ram_block_offset(block, addr),
4057                              block->idstr, addr, tmp_page->target_pages);
4058                 ret = -EINVAL;
4059                 break;
4060             }
4061 
4062             /*
4063              * If it's the last part of a host page then we place the host
4064              * page
4065              */
4066             if (tmp_page->target_pages ==
4067                 (block->page_size / TARGET_PAGE_SIZE)) {
4068                 place_needed = true;
4069             }
4070             place_source = tmp_page->tmp_huge_page;
4071         }
4072 
4073         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4074         case RAM_SAVE_FLAG_ZERO:
4075             ch = qemu_get_byte(f);
4076             /*
4077              * We can skip setting page_buffer when this is a zero page
4078              * and (block->page_size == TARGET_PAGE_SIZE).
4079              */
4080             if (ch || !matches_target_page_size) {
4081                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4082             }
4083             if (ch) {
4084                 tmp_page->all_zero = false;
4085             }
4086             break;
4087 
4088         case RAM_SAVE_FLAG_PAGE:
4089             tmp_page->all_zero = false;
4090             if (!matches_target_page_size) {
4091                 /* For huge pages, we always use temporary buffer */
4092                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4093             } else {
4094                 /*
4095                  * For small pages that match the target page size, we
4096                  * avoid the qemu_file copy.  Instead we directly use
4097                  * the QEMUFile buffer to place the page.  Note: we
4098                  * cannot do any QEMUFile operation before using that
4099                  * buffer, to make sure the buffer is still valid when
4100                  * placing the page.
4101                  */
4102                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4103                                          TARGET_PAGE_SIZE);
4104             }
4105             break;
4106         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4107             tmp_page->all_zero = false;
4108             len = qemu_get_be32(f);
4109             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4110                 error_report("Invalid compressed data length: %d", len);
4111                 ret = -EINVAL;
4112                 break;
4113             }
4114             decompress_data_with_multi_threads(f, page_buffer, len);
4115             break;
4116 
4117         case RAM_SAVE_FLAG_EOS:
4118             /* normal exit */
4119             multifd_recv_sync_main();
4120             break;
4121         default:
4122             error_report("Unknown combination of migration flags: 0x%x"
4123                          " (postcopy mode)", flags);
4124             ret = -EINVAL;
4125             break;
4126         }
4127 
4128         /* Got the whole host page, wait for decompress before placing. */
4129         if (place_needed) {
4130             ret |= wait_for_decompress_done();
4131         }
4132 
4133         /* Detect for any possible file errors */
4134         if (!ret && qemu_file_get_error(f)) {
4135             ret = qemu_file_get_error(f);
4136         }
4137 
4138         if (!ret && place_needed) {
4139             if (tmp_page->all_zero) {
4140                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4141             } else {
4142                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4143                                           place_source, block);
4144             }
4145             place_needed = false;
4146             postcopy_temp_page_reset(tmp_page);
4147         }
4148     }
4149 
4150     return ret;
4151 }
4152 
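/*
 * True while the incoming postcopy state is at least LISTENING but not
 * yet END, i.e. while incoming pages must be placed atomically.
 */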
4153 static bool postcopy_is_running(void)
4154 {
4155     PostcopyState ps = postcopy_state_get();
4156     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4157 }
4158 
4159 /*
4160  * Flush the content of the RAM cache into the SVM's memory.
4161  * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4162  */
4163 void colo_flush_ram_cache(void)
4164 {
4165     RAMBlock *block = NULL;
4166     void *dst_host;
4167     void *src_host;
4168     unsigned long offset = 0;
4169 
4170     memory_global_dirty_log_sync();
4171     WITH_RCU_READ_LOCK_GUARD() {
4172         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4173             ramblock_sync_dirty_bitmap(ram_state, block);
4174         }
4175     }
4176 
4177     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4178     WITH_RCU_READ_LOCK_GUARD() {
4179         block = QLIST_FIRST_RCU(&ram_list.blocks);
4180 
4181         while (block) {
4182             unsigned long num = 0;
4183 
4184             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4185             if (!offset_in_ramblock(block,
4186                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4187                 offset = 0;
4188                 num = 0;
4189                 block = QLIST_NEXT_RCU(block, next);
4190             } else {
4191                 unsigned long i = 0;
4192 
4193                 for (i = 0; i < num; i++) {
4194                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4195                 }
4196                 dst_host = block->host
4197                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4198                 src_host = block->colo_cache
4199                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4200                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4201                 offset += num;
4202             }
4203         }
4204     }
4205     trace_colo_flush_ram_cache_end();
4206 }
4207 
4208 /**
4209  * ram_load_precopy: load pages in precopy case
4210  *
4211  * Returns 0 for success or -errno in case of error
4212  *
4213  * Called in precopy mode by ram_load().
4214  * rcu_read_lock is taken prior to this being called.
4215  *
4216  * @f: QEMUFile where to send the data
4217  */
4218 static int ram_load_precopy(QEMUFile *f)
4219 {
4220     MigrationIncomingState *mis = migration_incoming_get_current();
4221     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4222     /* ADVISE comes earlier; it shows the source has the postcopy capability on */
4223     bool postcopy_advised = migration_incoming_postcopy_advised();
4224     if (!migrate_use_compression()) {
4225         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4226     }
4227 
4228     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4229         ram_addr_t addr, total_ram_bytes;
4230         void *host = NULL, *host_bak = NULL;
4231         uint8_t ch;
4232 
4233         /*
4234          * Yield periodically to let the main loop run, but an iteration of
4235          * the main loop is expensive, so only do it once every few iterations.
4236          */
4237         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4238             aio_co_schedule(qemu_get_current_aio_context(),
4239                             qemu_coroutine_self());
4240             qemu_coroutine_yield();
4241         }
4242         i++;
4243 
4244         addr = qemu_get_be64(f);
4245         flags = addr & ~TARGET_PAGE_MASK;
4246         addr &= TARGET_PAGE_MASK;
4247 
4248         if (flags & invalid_flags) {
4249             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4250                 error_report("Received an unexpected compressed page");
4251             }
4252 
4253             ret = -EINVAL;
4254             break;
4255         }
4256 
4257         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4258                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4259             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4260                                                     RAM_CHANNEL_PRECOPY);
4261 
4262             host = host_from_ram_block_offset(block, addr);
4263             /*
4264              * After entering the COLO stage, we should not load pages into the
4265              * SVM's memory directly; we put them into colo_cache first.
4266              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4267              * Previously, we copied all this memory in the COLO preparation
4268              * stage, during which the VM had to be stopped, which is a
4269              * time-consuming process.  Here we optimize it with a trick: back
4270              * up every page during migration while COLO is enabled.  Although
4271              * this affects migration speed, it clearly reduces the downtime of
4272              * backing up all of the SVM's memory in the COLO preparation stage.
4273              */
4274             if (migration_incoming_colo_enabled()) {
4275                 if (migration_incoming_in_colo_state()) {
4276                     /* In COLO stage, put all pages into cache temporarily */
4277                     host = colo_cache_from_block_offset(block, addr, true);
4278                 } else {
4279                    /*
4280                     * In migration stage but before COLO stage,
4281                     * Put all pages into both cache and SVM's memory.
4282                     */
4283                     host_bak = colo_cache_from_block_offset(block, addr, false);
4284                 }
4285             }
4286             if (!host) {
4287                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4288                 ret = -EINVAL;
4289                 break;
4290             }
4291             if (!migration_incoming_in_colo_state()) {
4292                 ramblock_recv_bitmap_set(block, host);
4293             }
4294 
4295             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4296         }
4297 
4298         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4299         case RAM_SAVE_FLAG_MEM_SIZE:
4300             /* Synchronize RAM block list */
4301             total_ram_bytes = addr;
4302             while (!ret && total_ram_bytes) {
4303                 RAMBlock *block;
4304                 char id[256];
4305                 ram_addr_t length;
4306 
4307                 len = qemu_get_byte(f);
4308                 qemu_get_buffer(f, (uint8_t *)id, len);
4309                 id[len] = 0;
4310                 length = qemu_get_be64(f);
4311 
4312                 block = qemu_ram_block_by_name(id);
4313                 if (block && !qemu_ram_is_migratable(block)) {
4314                     error_report("block %s should not be migrated !", id);
4315                     ret = -EINVAL;
4316                 } else if (block) {
4317                     if (length != block->used_length) {
4318                         Error *local_err = NULL;
4319 
4320                         ret = qemu_ram_resize(block, length,
4321                                               &local_err);
4322                         if (local_err) {
4323                             error_report_err(local_err);
4324                         }
4325                     }
4326                     /* For postcopy we need to check hugepage sizes match */
4327                     if (postcopy_advised && migrate_postcopy_ram() &&
4328                         block->page_size != qemu_host_page_size) {
4329                         uint64_t remote_page_size = qemu_get_be64(f);
4330                         if (remote_page_size != block->page_size) {
4331                             error_report("Mismatched RAM page size %s "
4332                                          "(local) %zd != %" PRId64,
4333                                          id, block->page_size,
4334                                          remote_page_size);
4335                             ret = -EINVAL;
4336                         }
4337                     }
4338                     if (migrate_ignore_shared()) {
4339                         hwaddr addr = qemu_get_be64(f);
4340                         if (ramblock_is_ignored(block) &&
4341                             block->mr->addr != addr) {
4342                             error_report("Mismatched GPAs for block %s "
4343                                          "%" PRId64 " != %" PRId64,
4344                                          id, (uint64_t)addr,
4345                                          (uint64_t)block->mr->addr);
4346                             ret = -EINVAL;
4347                         }
4348                     }
4349                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4350                                           block->idstr);
4351                 } else {
4352                     error_report("Unknown ramblock \"%s\", cannot "
4353                                  "accept migration", id);
4354                     ret = -EINVAL;
4355                 }
4356 
4357                 total_ram_bytes -= length;
4358             }
4359             break;
4360 
4361         case RAM_SAVE_FLAG_ZERO:
4362             ch = qemu_get_byte(f);
4363             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4364             break;
4365 
4366         case RAM_SAVE_FLAG_PAGE:
4367             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4368             break;
4369 
4370         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4371             len = qemu_get_be32(f);
4372             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4373                 error_report("Invalid compressed data length: %d", len);
4374                 ret = -EINVAL;
4375                 break;
4376             }
4377             decompress_data_with_multi_threads(f, host, len);
4378             break;
4379 
4380         case RAM_SAVE_FLAG_XBZRLE:
4381             if (load_xbzrle(f, addr, host) < 0) {
4382                 error_report("Failed to decompress XBZRLE page at "
4383                              RAM_ADDR_FMT, addr);
4384                 ret = -EINVAL;
4385                 break;
4386             }
4387             break;
4388         case RAM_SAVE_FLAG_EOS:
4389             /* normal exit */
4390             multifd_recv_sync_main();
4391             break;
4392         default:
4393             if (flags & RAM_SAVE_FLAG_HOOK) {
4394                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4395             } else {
4396                 error_report("Unknown combination of migration flags: 0x%x",
4397                              flags);
4398                 ret = -EINVAL;
4399             }
4400         }
4401         if (!ret) {
4402             ret = qemu_file_get_error(f);
4403         }
4404         if (!ret && host_bak) {
4405             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4406         }
4407     }
4408 
4409     ret |= wait_for_decompress_done();
4410     return ret;
4411 }
4412 
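/*
 * ram_load: entry point for loading a RAM section from the stream.
 * Dispatches to ram_load_postcopy() once postcopy is running, so that
 * pages are placed atomically, and to ram_load_precopy() otherwise.
 * Only stream version 4 is accepted.
 */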
4413 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4414 {
4415     int ret = 0;
4416     static uint64_t seq_iter;
4417     /*
4418      * If the system is running in postcopy mode, page inserts into host
4419      * memory must be atomic.
4420      */
4421     bool postcopy_running = postcopy_is_running();
4422 
4423     seq_iter++;
4424 
4425     if (version_id != 4) {
4426         return -EINVAL;
4427     }
4428 
4429     /*
4430      * This RCU critical section can be very long running.
4431      * When RCU reclaims in the code start to become numerous,
4432      * it will be necessary to reduce the granularity of this
4433      * critical section.
4434      */
4435     WITH_RCU_READ_LOCK_GUARD() {
4436         if (postcopy_running) {
4437             /*
4438              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4439              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4440              * service fast page faults.
4441              */
4442             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4443         } else {
4444             ret = ram_load_precopy(f);
4445         }
4446     }
4447     trace_ram_load_complete(ret, seq_iter);
4448 
4449     return ret;
4450 }
4451 
4452 static bool ram_has_postcopy(void *opaque)
4453 {
4454     RAMBlock *rb;
4455     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4456         if (ramblock_is_pmem(rb)) {
4457             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4458                         "is not supported now!", rb->idstr, rb->host);
4459             return false;
4460         }
4461     }
4462 
4463     return migrate_postcopy_ram();
4464 }
4465 
4466 /* Sync all the dirty bitmap with destination VM.  */
4467 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4468 {
4469     RAMBlock *block;
4470     QEMUFile *file = s->to_dst_file;
4471     int ramblock_count = 0;
4472 
4473     trace_ram_dirty_bitmap_sync_start();
4474 
4475     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4476         qemu_savevm_send_recv_bitmap(file, block->idstr);
4477         trace_ram_dirty_bitmap_request(block->idstr);
4478         ramblock_count++;
4479     }
4480 
4481     trace_ram_dirty_bitmap_sync_wait();
4482 
4483     /* Wait until all the ramblocks' dirty bitmaps are synced */
4484     while (ramblock_count--) {
4485         qemu_sem_wait(&s->rp_state.rp_sem);
4486     }
4487 
4488     trace_ram_dirty_bitmap_sync_complete();
4489 
4490     return 0;
4491 }
4492 
4493 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4494 {
4495     qemu_sem_post(&s->rp_state.rp_sem);
4496 }
4497 
4498 /*
4499  * Read the received bitmap and invert it to form the initial dirty bitmap.
4500  * This is only used when a postcopy migration has been paused and wants
4501  * to resume from a middle point.
4502  */
4503 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4504 {
4505     int ret = -EINVAL;
4506     /* from_dst_file is always valid because we're within rp_thread */
4507     QEMUFile *file = s->rp_state.from_dst_file;
4508     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4509     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4510     uint64_t size, end_mark;
4511 
4512     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4513 
4514     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4515         error_report("%s: incorrect state %s", __func__,
4516                      MigrationStatus_str(s->state));
4517         return -EINVAL;
4518     }
4519 
4520     /*
4521      * Note: see comments in ramblock_recv_bitmap_send() on why we
4522      * need the endianness conversion, and the paddings.
4523      */
4524     local_size = ROUND_UP(local_size, 8);
4525 
4526     /* Add paddings */
4527     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4528 
4529     size = qemu_get_be64(file);
4530 
4531     /* The size of the bitmap should match our ramblock */
4532     if (size != local_size) {
4533         error_report("%s: ramblock '%s' bitmap size mismatch "
4534                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4535                      block->idstr, size, local_size);
4536         ret = -EINVAL;
4537         goto out;
4538     }
4539 
4540     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4541     end_mark = qemu_get_be64(file);
4542 
4543     ret = qemu_file_get_error(file);
4544     if (ret || size != local_size) {
4545         error_report("%s: read bitmap failed for ramblock '%s': %d"
4546                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4547                      __func__, block->idstr, ret, local_size, size);
4548         ret = -EIO;
4549         goto out;
4550     }
4551 
4552     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4553         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4554                      __func__, block->idstr, end_mark);
4555         ret = -EINVAL;
4556         goto out;
4557     }
4558 
4559     /*
4560      * Endianness conversion.  We are in postcopy (though paused), so the
4561      * dirty bitmap won't change; we can modify it directly.
4562      */
4563     bitmap_from_le(block->bmap, le_bitmap, nbits);
4564 
4565     /*
4566      * What we received is the "received bitmap".  Invert it to form the
4567      * initial dirty bitmap for this ramblock.
4568      */
4569     bitmap_complement(block->bmap, block->bmap, nbits);
4570 
4571     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4572     ramblock_dirty_bitmap_clear_discarded_pages(block);
4573 
4574     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4575     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4576 
4577     /*
4578      * We succeeded in syncing the bitmap for the current ramblock.  If
4579      * this is the last one to sync, we need to notify the main send thread.
4580      */
4581     ram_dirty_bitmap_reload_notify(s);
4582 
4583     ret = 0;
4584 out:
4585     g_free(le_bitmap);
4586     return ret;
4587 }
4588 
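/*
 * Postcopy-recovery hook: re-sync the received bitmaps from the
 * destination to rebuild the dirty bitmap, then reset RAMState so the
 * resumed migration continues from a consistent point.
 */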
4589 static int ram_resume_prepare(MigrationState *s, void *opaque)
4590 {
4591     RAMState *rs = *(RAMState **)opaque;
4592     int ret;
4593 
4594     ret = ram_dirty_bitmap_sync_all(s, rs);
4595     if (ret) {
4596         return ret;
4597     }
4598 
4599     ram_state_resume_prepare(rs, s->to_dst_file);
4600 
4601     return 0;
4602 }
4603 
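/*
 * Mark the postcopy preempt channel as finished by sending a final EOS
 * flag and flushing the stream.
 */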
4604 void postcopy_preempt_shutdown_file(MigrationState *s)
4605 {
4606     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4607     qemu_fflush(s->postcopy_qemufile_src);
4608 }
4609 
4610 static SaveVMHandlers savevm_ram_handlers = {
4611     .save_setup = ram_save_setup,
4612     .save_live_iterate = ram_save_iterate,
4613     .save_live_complete_postcopy = ram_save_complete,
4614     .save_live_complete_precopy = ram_save_complete,
4615     .has_postcopy = ram_has_postcopy,
4616     .state_pending_exact = ram_state_pending_exact,
4617     .state_pending_estimate = ram_state_pending_estimate,
4618     .load_state = ram_load,
4619     .save_cleanup = ram_save_cleanup,
4620     .load_setup = ram_load_setup,
4621     .load_cleanup = ram_load_cleanup,
4622     .resume_prepare = ram_resume_prepare,
4623 };
4624 
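/*
 * RAMBlock resize notifier.  Resizing a block while a migration is in
 * progress is not supported, so the migration is cancelled.  On the
 * incoming side, while postcopy is merely advised the new size is
 * recorded (and newly added ranges discarded); once the guest runs
 * under postcopy, resizes are ignored.
 */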
4625 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4626                                       size_t old_size, size_t new_size)
4627 {
4628     PostcopyState ps = postcopy_state_get();
4629     ram_addr_t offset;
4630     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4631     Error *err = NULL;
4632 
4633     if (ramblock_is_ignored(rb)) {
4634         return;
4635     }
4636 
4637     if (!migration_is_idle()) {
4638         /*
4639          * Precopy code on the source cannot deal with the size of RAM blocks
4640          * changing at random points in time - especially after sending the
4641          * RAM block sizes in the migration stream, they must no longer change.
4642          * Abort and indicate a proper reason.
4643          */
4644         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4645         migration_cancel(err);
4646         error_free(err);
4647     }
4648 
4649     switch (ps) {
4650     case POSTCOPY_INCOMING_ADVISE:
4651         /*
4652          * Update what ram_postcopy_incoming_init()->init_range() does at the
4653          * time postcopy was advised. Syncing RAM blocks with the source will
4654          * result in RAM resizes.
4655          */
4656         if (old_size < new_size) {
4657             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4658                 error_report("RAM block '%s' discard of resized RAM failed",
4659                              rb->idstr);
4660             }
4661         }
4662         rb->postcopy_length = new_size;
4663         break;
4664     case POSTCOPY_INCOMING_NONE:
4665     case POSTCOPY_INCOMING_RUNNING:
4666     case POSTCOPY_INCOMING_END:
4667         /*
4668          * Once our guest is running, postcopy no longer cares about
4669          * resizes. When growing, the new memory was not available on the
4670          * source, so no handler is needed.
4671          */
4672         break;
4673     default:
4674         error_report("RAM block '%s' resized during postcopy state: %d",
4675                      rb->idstr, ps);
4676         exit(-1);
4677     }
4678 }
4679 
4680 static RAMBlockNotifier ram_mig_ram_notifier = {
4681     .ram_block_resized = ram_mig_ram_block_resized,
4682 };
4683 
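/*
 * Register the "ram" live-migration handlers (stream version 4) and the
 * RAM block resize notifier, and initialize the XBZRLE cache lock.
 */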
4684 void ram_mig_init(void)
4685 {
4686     qemu_mutex_init(&XBZRLE.lock);
4687     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4688     ram_block_notifier_add(&ram_mig_ram_notifier);
4689 }
4690