xref: /openbmc/qemu/migration/ram.c (revision 85d8da3f)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64 
65 /***********************************************************/
66 /* ram save/restore */
67 
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69  * worked for pages that where filled with the same char.  We switched
70  * it to only search for the zero value.  And to avoid confusion with
71  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
72  */
73 
74 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO     0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE     0x08
78 #define RAM_SAVE_FLAG_EOS      0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE   0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
83 
84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
85 {
86     return buffer_is_zero(p, size);
87 }
88 
89 XBZRLECacheStats xbzrle_counters;
90 
91 /* struct contains XBZRLE cache and a static page
92    used by the compression */
93 static struct {
94     /* buffer used for XBZRLE encoding */
95     uint8_t *encoded_buf;
96     /* buffer for storing page content */
97     uint8_t *current_buf;
98     /* Cache for XBZRLE, Protected by lock. */
99     PageCache *cache;
100     QemuMutex lock;
101     /* it will store a page full of zeros */
102     uint8_t *zero_target_page;
103     /* buffer used for XBZRLE decoding */
104     uint8_t *decoded_buf;
105 } XBZRLE;
106 
107 static void XBZRLE_cache_lock(void)
108 {
109     if (migrate_use_xbzrle()) {
110         qemu_mutex_lock(&XBZRLE.lock);
111     }
112 }
113 
114 static void XBZRLE_cache_unlock(void)
115 {
116     if (migrate_use_xbzrle()) {
117         qemu_mutex_unlock(&XBZRLE.lock);
118     }
119 }
120 
121 /**
122  * xbzrle_cache_resize: resize the xbzrle cache
123  *
124  * This function is called from migrate_params_apply in main
125  * thread, possibly while a migration is in progress.  A running
126  * migration may be using the cache and might finish during this call,
127  * hence changes to the cache are protected by XBZRLE.lock().
128  *
129  * Returns 0 for success or -1 for error
130  *
131  * @new_size: new cache size
132  * @errp: set *errp if the check failed, with reason
133  */
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
135 {
136     PageCache *new_cache;
137     int64_t ret = 0;
138 
139     /* Check for truncation */
140     if (new_size != (size_t)new_size) {
141         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142                    "exceeding address space");
143         return -1;
144     }
145 
146     if (new_size == migrate_xbzrle_cache_size()) {
147         /* nothing to do */
148         return 0;
149     }
150 
151     XBZRLE_cache_lock();
152 
153     if (XBZRLE.cache != NULL) {
154         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
155         if (!new_cache) {
156             ret = -1;
157             goto out;
158         }
159 
160         cache_fini(XBZRLE.cache);
161         XBZRLE.cache = new_cache;
162     }
163 out:
164     XBZRLE_cache_unlock();
165     return ret;
166 }
167 
168 bool ramblock_is_ignored(RAMBlock *block)
169 {
170     return !qemu_ram_is_migratable(block) ||
171            (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 }
173 
174 #undef RAMBLOCK_FOREACH
175 
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177 {
178     RAMBlock *block;
179     int ret = 0;
180 
181     RCU_READ_LOCK_GUARD();
182 
183     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184         ret = func(block, opaque);
185         if (ret) {
186             break;
187         }
188     }
189     return ret;
190 }
191 
192 static void ramblock_recv_map_init(void)
193 {
194     RAMBlock *rb;
195 
196     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
197         assert(!rb->receivedmap);
198         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
199     }
200 }
201 
202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
203 {
204     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
205                     rb->receivedmap);
206 }
207 
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
209 {
210     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 }
212 
213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
214 {
215     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 }
217 
218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
219                                     size_t nr)
220 {
221     bitmap_set_atomic(rb->receivedmap,
222                       ramblock_recv_bitmap_offset(host_addr, rb),
223                       nr);
224 }
225 
226 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
227 
228 /*
229  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
230  *
231  * Returns >0 if success with sent bytes, or <0 if error.
232  */
233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234                                   const char *block_name)
235 {
236     RAMBlock *block = qemu_ram_block_by_name(block_name);
237     unsigned long *le_bitmap, nbits;
238     uint64_t size;
239 
240     if (!block) {
241         error_report("%s: invalid block name: %s", __func__, block_name);
242         return -1;
243     }
244 
245     nbits = block->used_length >> TARGET_PAGE_BITS;
246 
247     /*
248      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249      * machines we may need 4 more bytes for padding (see below
250      * comment). So extend it a bit before hand.
251      */
252     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 
254     /*
255      * Always use little endian when sending the bitmap. This is
256      * required that when source and destination VMs are not using the
257      * same endianness. (Note: big endian won't work.)
258      */
259     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
260 
261     /* Size of the bitmap, in bytes */
262     size = DIV_ROUND_UP(nbits, 8);
263 
264     /*
265      * size is always aligned to 8 bytes for 64bit machines, but it
266      * may not be true for 32bit machines. We need this padding to
267      * make sure the migration can survive even between 32bit and
268      * 64bit machines.
269      */
270     size = ROUND_UP(size, 8);
271 
272     qemu_put_be64(file, size);
273     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
274     /*
275      * Mark as an end, in case the middle part is screwed up due to
276      * some "mysterious" reason.
277      */
278     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
279     qemu_fflush(file);
280 
281     g_free(le_bitmap);
282 
283     if (qemu_file_get_error(file)) {
284         return qemu_file_get_error(file);
285     }
286 
287     return size + sizeof(size);
288 }
289 
290 /*
291  * An outstanding page request, on the source, having been received
292  * and queued
293  */
294 struct RAMSrcPageRequest {
295     RAMBlock *rb;
296     hwaddr    offset;
297     hwaddr    len;
298 
299     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 };
301 
302 /* State of RAM for migration */
303 struct RAMState {
304     /* QEMUFile used for this migration */
305     QEMUFile *f;
306     /* UFFD file descriptor, used in 'write-tracking' migration */
307     int uffdio_fd;
308     /* Last block that we have visited searching for dirty pages */
309     RAMBlock *last_seen_block;
310     /* Last block from where we have sent data */
311     RAMBlock *last_sent_block;
312     /* Last dirty target page we have sent */
313     ram_addr_t last_page;
314     /* last ram version we have seen */
315     uint32_t last_version;
316     /* We are in the first round */
317     bool ram_bulk_stage;
318     /* The free page optimization is enabled */
319     bool fpo_enabled;
320     /* How many times we have dirty too many pages */
321     int dirty_rate_high_cnt;
322     /* these variables are used for bitmap sync */
323     /* last time we did a full bitmap_sync */
324     int64_t time_last_bitmap_sync;
325     /* bytes transferred at start_time */
326     uint64_t bytes_xfer_prev;
327     /* number of dirty pages since start_time */
328     uint64_t num_dirty_pages_period;
329     /* xbzrle misses since the beginning of the period */
330     uint64_t xbzrle_cache_miss_prev;
331     /* Amount of xbzrle pages since the beginning of the period */
332     uint64_t xbzrle_pages_prev;
333     /* Amount of xbzrle encoded bytes since the beginning of the period */
334     uint64_t xbzrle_bytes_prev;
335 
336     /* compression statistics since the beginning of the period */
337     /* amount of count that no free thread to compress data */
338     uint64_t compress_thread_busy_prev;
339     /* amount bytes after compression */
340     uint64_t compressed_size_prev;
341     /* amount of compressed pages */
342     uint64_t compress_pages_prev;
343 
344     /* total handled target pages at the beginning of period */
345     uint64_t target_page_count_prev;
346     /* total handled target pages since start */
347     uint64_t target_page_count;
348     /* number of dirty bits in the bitmap */
349     uint64_t migration_dirty_pages;
350     /* Protects modification of the bitmap and migration dirty pages */
351     QemuMutex bitmap_mutex;
352     /* The RAMBlock used in the last src_page_requests */
353     RAMBlock *last_req_rb;
354     /* Queue of outstanding page requests from the destination */
355     QemuMutex src_page_req_mutex;
356     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
357 };
358 typedef struct RAMState RAMState;
359 
360 static RAMState *ram_state;
361 
362 static NotifierWithReturnList precopy_notifier_list;
363 
364 void precopy_infrastructure_init(void)
365 {
366     notifier_with_return_list_init(&precopy_notifier_list);
367 }
368 
369 void precopy_add_notifier(NotifierWithReturn *n)
370 {
371     notifier_with_return_list_add(&precopy_notifier_list, n);
372 }
373 
374 void precopy_remove_notifier(NotifierWithReturn *n)
375 {
376     notifier_with_return_remove(n);
377 }
378 
379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
380 {
381     PrecopyNotifyData pnd;
382     pnd.reason = reason;
383     pnd.errp = errp;
384 
385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
386 }
387 
388 void precopy_enable_free_page_optimization(void)
389 {
390     if (!ram_state) {
391         return;
392     }
393 
394     ram_state->fpo_enabled = true;
395 }
396 
397 uint64_t ram_bytes_remaining(void)
398 {
399     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
400                        0;
401 }
402 
403 MigrationStats ram_counters;
404 
405 /* used by the search for pages to send */
406 struct PageSearchStatus {
407     /* Current block being searched */
408     RAMBlock    *block;
409     /* Current page to search from */
410     unsigned long page;
411     /* Set once we wrap around */
412     bool         complete_round;
413 };
414 typedef struct PageSearchStatus PageSearchStatus;
415 
416 CompressionStats compression_counters;
417 
418 struct CompressParam {
419     bool done;
420     bool quit;
421     bool zero_page;
422     QEMUFile *file;
423     QemuMutex mutex;
424     QemuCond cond;
425     RAMBlock *block;
426     ram_addr_t offset;
427 
428     /* internally used fields */
429     z_stream stream;
430     uint8_t *originbuf;
431 };
432 typedef struct CompressParam CompressParam;
433 
434 struct DecompressParam {
435     bool done;
436     bool quit;
437     QemuMutex mutex;
438     QemuCond cond;
439     void *des;
440     uint8_t *compbuf;
441     int len;
442     z_stream stream;
443 };
444 typedef struct DecompressParam DecompressParam;
445 
446 static CompressParam *comp_param;
447 static QemuThread *compress_threads;
448 /* comp_done_cond is used to wake up the migration thread when
449  * one of the compression threads has finished the compression.
450  * comp_done_lock is used to co-work with comp_done_cond.
451  */
452 static QemuMutex comp_done_lock;
453 static QemuCond comp_done_cond;
454 /* The empty QEMUFileOps will be used by file in CompressParam */
455 static const QEMUFileOps empty_ops = { };
456 
457 static QEMUFile *decomp_file;
458 static DecompressParam *decomp_param;
459 static QemuThread *decompress_threads;
460 static QemuMutex decomp_done_lock;
461 static QemuCond decomp_done_cond;
462 
463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
464                                  ram_addr_t offset, uint8_t *source_buf);
465 
466 static void *do_data_compress(void *opaque)
467 {
468     CompressParam *param = opaque;
469     RAMBlock *block;
470     ram_addr_t offset;
471     bool zero_page;
472 
473     qemu_mutex_lock(&param->mutex);
474     while (!param->quit) {
475         if (param->block) {
476             block = param->block;
477             offset = param->offset;
478             param->block = NULL;
479             qemu_mutex_unlock(&param->mutex);
480 
481             zero_page = do_compress_ram_page(param->file, &param->stream,
482                                              block, offset, param->originbuf);
483 
484             qemu_mutex_lock(&comp_done_lock);
485             param->done = true;
486             param->zero_page = zero_page;
487             qemu_cond_signal(&comp_done_cond);
488             qemu_mutex_unlock(&comp_done_lock);
489 
490             qemu_mutex_lock(&param->mutex);
491         } else {
492             qemu_cond_wait(&param->cond, &param->mutex);
493         }
494     }
495     qemu_mutex_unlock(&param->mutex);
496 
497     return NULL;
498 }
499 
500 static void compress_threads_save_cleanup(void)
501 {
502     int i, thread_count;
503 
504     if (!migrate_use_compression() || !comp_param) {
505         return;
506     }
507 
508     thread_count = migrate_compress_threads();
509     for (i = 0; i < thread_count; i++) {
510         /*
511          * we use it as a indicator which shows if the thread is
512          * properly init'd or not
513          */
514         if (!comp_param[i].file) {
515             break;
516         }
517 
518         qemu_mutex_lock(&comp_param[i].mutex);
519         comp_param[i].quit = true;
520         qemu_cond_signal(&comp_param[i].cond);
521         qemu_mutex_unlock(&comp_param[i].mutex);
522 
523         qemu_thread_join(compress_threads + i);
524         qemu_mutex_destroy(&comp_param[i].mutex);
525         qemu_cond_destroy(&comp_param[i].cond);
526         deflateEnd(&comp_param[i].stream);
527         g_free(comp_param[i].originbuf);
528         qemu_fclose(comp_param[i].file);
529         comp_param[i].file = NULL;
530     }
531     qemu_mutex_destroy(&comp_done_lock);
532     qemu_cond_destroy(&comp_done_cond);
533     g_free(compress_threads);
534     g_free(comp_param);
535     compress_threads = NULL;
536     comp_param = NULL;
537 }
538 
539 static int compress_threads_save_setup(void)
540 {
541     int i, thread_count;
542 
543     if (!migrate_use_compression()) {
544         return 0;
545     }
546     thread_count = migrate_compress_threads();
547     compress_threads = g_new0(QemuThread, thread_count);
548     comp_param = g_new0(CompressParam, thread_count);
549     qemu_cond_init(&comp_done_cond);
550     qemu_mutex_init(&comp_done_lock);
551     for (i = 0; i < thread_count; i++) {
552         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
553         if (!comp_param[i].originbuf) {
554             goto exit;
555         }
556 
557         if (deflateInit(&comp_param[i].stream,
558                         migrate_compress_level()) != Z_OK) {
559             g_free(comp_param[i].originbuf);
560             goto exit;
561         }
562 
563         /* comp_param[i].file is just used as a dummy buffer to save data,
564          * set its ops to empty.
565          */
566         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
567         comp_param[i].done = true;
568         comp_param[i].quit = false;
569         qemu_mutex_init(&comp_param[i].mutex);
570         qemu_cond_init(&comp_param[i].cond);
571         qemu_thread_create(compress_threads + i, "compress",
572                            do_data_compress, comp_param + i,
573                            QEMU_THREAD_JOINABLE);
574     }
575     return 0;
576 
577 exit:
578     compress_threads_save_cleanup();
579     return -1;
580 }
581 
582 /**
583  * save_page_header: write page header to wire
584  *
585  * If this is the 1st block, it also writes the block identification
586  *
587  * Returns the number of bytes written
588  *
589  * @f: QEMUFile where to send the data
590  * @block: block that contains the page we want to send
591  * @offset: offset inside the block for the page
592  *          in the lower bits, it contains flags
593  */
594 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
595                                ram_addr_t offset)
596 {
597     size_t size, len;
598 
599     if (block == rs->last_sent_block) {
600         offset |= RAM_SAVE_FLAG_CONTINUE;
601     }
602     qemu_put_be64(f, offset);
603     size = 8;
604 
605     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
606         len = strlen(block->idstr);
607         qemu_put_byte(f, len);
608         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
609         size += 1 + len;
610         rs->last_sent_block = block;
611     }
612     return size;
613 }
614 
615 /**
616  * mig_throttle_guest_down: throotle down the guest
617  *
618  * Reduce amount of guest cpu execution to hopefully slow down memory
619  * writes. If guest dirty memory rate is reduced below the rate at
620  * which we can transfer pages to the destination then we should be
621  * able to complete migration. Some workloads dirty memory way too
622  * fast and will not effectively converge, even with auto-converge.
623  */
624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
625                                     uint64_t bytes_dirty_threshold)
626 {
627     MigrationState *s = migrate_get_current();
628     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
629     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
630     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
631     int pct_max = s->parameters.max_cpu_throttle;
632 
633     uint64_t throttle_now = cpu_throttle_get_percentage();
634     uint64_t cpu_now, cpu_ideal, throttle_inc;
635 
636     /* We have not started throttling yet. Let's start it. */
637     if (!cpu_throttle_active()) {
638         cpu_throttle_set(pct_initial);
639     } else {
640         /* Throttling already on, just increase the rate */
641         if (!pct_tailslow) {
642             throttle_inc = pct_increment;
643         } else {
644             /* Compute the ideal CPU percentage used by Guest, which may
645              * make the dirty rate match the dirty rate threshold. */
646             cpu_now = 100 - throttle_now;
647             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
648                         bytes_dirty_period);
649             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
650         }
651         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
652     }
653 }
654 
655 /**
656  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
657  *
658  * @rs: current RAM state
659  * @current_addr: address for the zero page
660  *
661  * Update the xbzrle cache to reflect a page that's been sent as all 0.
662  * The important thing is that a stale (not-yet-0'd) page be replaced
663  * by the new data.
664  * As a bonus, if the page wasn't in the cache it gets added so that
665  * when a small write is made into the 0'd page it gets XBZRLE sent.
666  */
667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
668 {
669     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
670         return;
671     }
672 
673     /* We don't care if this fails to allocate a new cache page
674      * as long as it updated an old one */
675     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
676                  ram_counters.dirty_sync_count);
677 }
678 
679 #define ENCODING_FLAG_XBZRLE 0x1
680 
681 /**
682  * save_xbzrle_page: compress and send current page
683  *
684  * Returns: 1 means that we wrote the page
685  *          0 means that page is identical to the one already sent
686  *          -1 means that xbzrle would be longer than normal
687  *
688  * @rs: current RAM state
689  * @current_data: pointer to the address of the page contents
690  * @current_addr: addr of the page
691  * @block: block that contains the page we want to send
692  * @offset: offset inside the block for the page
693  * @last_stage: if we are at the completion stage
694  */
695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
696                             ram_addr_t current_addr, RAMBlock *block,
697                             ram_addr_t offset, bool last_stage)
698 {
699     int encoded_len = 0, bytes_xbzrle;
700     uint8_t *prev_cached_page;
701 
702     if (!cache_is_cached(XBZRLE.cache, current_addr,
703                          ram_counters.dirty_sync_count)) {
704         xbzrle_counters.cache_miss++;
705         if (!last_stage) {
706             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
707                              ram_counters.dirty_sync_count) == -1) {
708                 return -1;
709             } else {
710                 /* update *current_data when the page has been
711                    inserted into cache */
712                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
713             }
714         }
715         return -1;
716     }
717 
718     /*
719      * Reaching here means the page has hit the xbzrle cache, no matter what
720      * encoding result it is (normal encoding, overflow or skipping the page),
721      * count the page as encoded. This is used to calculate the encoding rate.
722      *
723      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
724      * 2nd page turns out to be skipped (i.e. no new bytes written to the
725      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
726      * skipped page included. In this way, the encoding rate can tell if the
727      * guest page is good for xbzrle encoding.
728      */
729     xbzrle_counters.pages++;
730     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
731 
732     /* save current buffer into memory */
733     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
734 
735     /* XBZRLE encoding (if there is no overflow) */
736     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
737                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
738                                        TARGET_PAGE_SIZE);
739 
740     /*
741      * Update the cache contents, so that it corresponds to the data
742      * sent, in all cases except where we skip the page.
743      */
744     if (!last_stage && encoded_len != 0) {
745         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
746         /*
747          * In the case where we couldn't compress, ensure that the caller
748          * sends the data from the cache, since the guest might have
749          * changed the RAM since we copied it.
750          */
751         *current_data = prev_cached_page;
752     }
753 
754     if (encoded_len == 0) {
755         trace_save_xbzrle_page_skipping();
756         return 0;
757     } else if (encoded_len == -1) {
758         trace_save_xbzrle_page_overflow();
759         xbzrle_counters.overflow++;
760         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
761         return -1;
762     }
763 
764     /* Send XBZRLE based compressed page */
765     bytes_xbzrle = save_page_header(rs, rs->f, block,
766                                     offset | RAM_SAVE_FLAG_XBZRLE);
767     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
768     qemu_put_be16(rs->f, encoded_len);
769     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
770     bytes_xbzrle += encoded_len + 1 + 2;
771     /*
772      * Like compressed_size (please see update_compress_thread_counts),
773      * the xbzrle encoded bytes don't count the 8 byte header with
774      * RAM_SAVE_FLAG_CONTINUE.
775      */
776     xbzrle_counters.bytes += bytes_xbzrle - 8;
777     ram_counters.transferred += bytes_xbzrle;
778 
779     return 1;
780 }
781 
782 /**
783  * migration_bitmap_find_dirty: find the next dirty page from start
784  *
785  * Returns the page offset within memory region of the start of a dirty page
786  *
787  * @rs: current RAM state
788  * @rb: RAMBlock where to search for dirty pages
789  * @start: page where we start the search
790  */
791 static inline
792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
793                                           unsigned long start)
794 {
795     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796     unsigned long *bitmap = rb->bmap;
797     unsigned long next;
798 
799     if (ramblock_is_ignored(rb)) {
800         return size;
801     }
802 
803     /*
804      * When the free page optimization is enabled, we need to check the bitmap
805      * to send the non-free pages rather than all the pages in the bulk stage.
806      */
807     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
808         next = start + 1;
809     } else {
810         next = find_next_bit(bitmap, size, start);
811     }
812 
813     return next;
814 }
815 
816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
817                                                 RAMBlock *rb,
818                                                 unsigned long page)
819 {
820     bool ret;
821 
822     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
823 
824     /*
825      * Clear dirty bitmap if needed.  This _must_ be called before we
826      * send any of the page in the chunk because we need to make sure
827      * we can capture further page content changes when we sync dirty
828      * log the next time.  So as long as we are going to send any of
829      * the page in the chunk we clear the remote dirty bitmap for all.
830      * Clearing it earlier won't be a problem, but too late will.
831      */
832     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
833         uint8_t shift = rb->clear_bmap_shift;
834         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
835         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
836 
837         /*
838          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
839          * can make things easier sometimes since then start address
840          * of the small chunk will always be 64 pages aligned so the
841          * bitmap will always be aligned to unsigned long.  We should
842          * even be able to remove this restriction but I'm simply
843          * keeping it.
844          */
845         assert(shift >= 6);
846         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
847         memory_region_clear_dirty_bitmap(rb->mr, start, size);
848     }
849 
850     ret = test_and_clear_bit(page, rb->bmap);
851 
852     if (ret) {
853         rs->migration_dirty_pages--;
854     }
855 
856     return ret;
857 }
858 
859 /* Called with RCU critical section */
860 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
861 {
862     uint64_t new_dirty_pages =
863         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
864 
865     rs->migration_dirty_pages += new_dirty_pages;
866     rs->num_dirty_pages_period += new_dirty_pages;
867 }
868 
869 /**
870  * ram_pagesize_summary: calculate all the pagesizes of a VM
871  *
872  * Returns a summary bitmap of the page sizes of all RAMBlocks
873  *
874  * For VMs with just normal pages this is equivalent to the host page
875  * size. If it's got some huge pages then it's the OR of all the
876  * different page sizes.
877  */
878 uint64_t ram_pagesize_summary(void)
879 {
880     RAMBlock *block;
881     uint64_t summary = 0;
882 
883     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
884         summary |= block->page_size;
885     }
886 
887     return summary;
888 }
889 
890 uint64_t ram_get_total_transferred_pages(void)
891 {
892     return  ram_counters.normal + ram_counters.duplicate +
893                 compression_counters.pages + xbzrle_counters.pages;
894 }
895 
896 static void migration_update_rates(RAMState *rs, int64_t end_time)
897 {
898     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
899     double compressed_size;
900 
901     /* calculate period counters */
902     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
903                 / (end_time - rs->time_last_bitmap_sync);
904 
905     if (!page_count) {
906         return;
907     }
908 
909     if (migrate_use_xbzrle()) {
910         double encoded_size, unencoded_size;
911 
912         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
913             rs->xbzrle_cache_miss_prev) / page_count;
914         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
915         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
916                          TARGET_PAGE_SIZE;
917         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
918         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
919             xbzrle_counters.encoding_rate = 0;
920         } else {
921             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
922         }
923         rs->xbzrle_pages_prev = xbzrle_counters.pages;
924         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
925     }
926 
927     if (migrate_use_compression()) {
928         compression_counters.busy_rate = (double)(compression_counters.busy -
929             rs->compress_thread_busy_prev) / page_count;
930         rs->compress_thread_busy_prev = compression_counters.busy;
931 
932         compressed_size = compression_counters.compressed_size -
933                           rs->compressed_size_prev;
934         if (compressed_size) {
935             double uncompressed_size = (compression_counters.pages -
936                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
937 
938             /* Compression-Ratio = Uncompressed-size / Compressed-size */
939             compression_counters.compression_rate =
940                                         uncompressed_size / compressed_size;
941 
942             rs->compress_pages_prev = compression_counters.pages;
943             rs->compressed_size_prev = compression_counters.compressed_size;
944         }
945     }
946 }
947 
948 static void migration_trigger_throttle(RAMState *rs)
949 {
950     MigrationState *s = migrate_get_current();
951     uint64_t threshold = s->parameters.throttle_trigger_threshold;
952 
953     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
954     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
955     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
956 
957     /* During block migration the auto-converge logic incorrectly detects
958      * that ram migration makes no progress. Avoid this by disabling the
959      * throttling logic during the bulk phase of block migration. */
960     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
961         /* The following detection logic can be refined later. For now:
962            Check to see if the ratio between dirtied bytes and the approx.
963            amount of bytes that just got transferred since the last time
964            we were in this routine reaches the threshold. If that happens
965            twice, start or increase throttling. */
966 
967         if ((bytes_dirty_period > bytes_dirty_threshold) &&
968             (++rs->dirty_rate_high_cnt >= 2)) {
969             trace_migration_throttle();
970             rs->dirty_rate_high_cnt = 0;
971             mig_throttle_guest_down(bytes_dirty_period,
972                                     bytes_dirty_threshold);
973         }
974     }
975 }
976 
977 static void migration_bitmap_sync(RAMState *rs)
978 {
979     RAMBlock *block;
980     int64_t end_time;
981 
982     ram_counters.dirty_sync_count++;
983 
984     if (!rs->time_last_bitmap_sync) {
985         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
986     }
987 
988     trace_migration_bitmap_sync_start();
989     memory_global_dirty_log_sync();
990 
991     qemu_mutex_lock(&rs->bitmap_mutex);
992     WITH_RCU_READ_LOCK_GUARD() {
993         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
994             ramblock_sync_dirty_bitmap(rs, block);
995         }
996         ram_counters.remaining = ram_bytes_remaining();
997     }
998     qemu_mutex_unlock(&rs->bitmap_mutex);
999 
1000     memory_global_after_dirty_log_sync();
1001     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002 
1003     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004 
1005     /* more than 1 second = 1000 millisecons */
1006     if (end_time > rs->time_last_bitmap_sync + 1000) {
1007         migration_trigger_throttle(rs);
1008 
1009         migration_update_rates(rs, end_time);
1010 
1011         rs->target_page_count_prev = rs->target_page_count;
1012 
1013         /* reset period counters */
1014         rs->time_last_bitmap_sync = end_time;
1015         rs->num_dirty_pages_period = 0;
1016         rs->bytes_xfer_prev = ram_counters.transferred;
1017     }
1018     if (migrate_use_events()) {
1019         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020     }
1021 }
1022 
1023 static void migration_bitmap_sync_precopy(RAMState *rs)
1024 {
1025     Error *local_err = NULL;
1026 
1027     /*
1028      * The current notifier usage is just an optimization to migration, so we
1029      * don't stop the normal migration process in the error case.
1030      */
1031     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032         error_report_err(local_err);
1033         local_err = NULL;
1034     }
1035 
1036     migration_bitmap_sync(rs);
1037 
1038     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039         error_report_err(local_err);
1040     }
1041 }
1042 
1043 /**
1044  * save_zero_page_to_file: send the zero page to the file
1045  *
1046  * Returns the size of data written to the file, 0 means the page is not
1047  * a zero page
1048  *
1049  * @rs: current RAM state
1050  * @file: the file where the data is saved
1051  * @block: block that contains the page we want to send
1052  * @offset: offset inside the block for the page
1053  */
1054 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                   RAMBlock *block, ram_addr_t offset)
1056 {
1057     uint8_t *p = block->host + offset;
1058     int len = 0;
1059 
1060     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062         qemu_put_byte(file, 0);
1063         len += 1;
1064     }
1065     return len;
1066 }
1067 
1068 /**
1069  * save_zero_page: send the zero page to the stream
1070  *
1071  * Returns the number of pages written.
1072  *
1073  * @rs: current RAM state
1074  * @block: block that contains the page we want to send
1075  * @offset: offset inside the block for the page
1076  */
1077 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078 {
1079     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080 
1081     if (len) {
1082         ram_counters.duplicate++;
1083         ram_counters.transferred += len;
1084         return 1;
1085     }
1086     return -1;
1087 }
1088 
1089 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090 {
1091     if (!migrate_release_ram() || !migration_in_postcopy()) {
1092         return;
1093     }
1094 
1095     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096 }
1097 
1098 /*
1099  * @pages: the number of pages written by the control path,
1100  *        < 0 - error
1101  *        > 0 - number of pages written
1102  *
1103  * Return true if the pages has been saved, otherwise false is returned.
1104  */
1105 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                               int *pages)
1107 {
1108     uint64_t bytes_xmit = 0;
1109     int ret;
1110 
1111     *pages = -1;
1112     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                 &bytes_xmit);
1114     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115         return false;
1116     }
1117 
1118     if (bytes_xmit) {
1119         ram_counters.transferred += bytes_xmit;
1120         *pages = 1;
1121     }
1122 
1123     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124         return true;
1125     }
1126 
1127     if (bytes_xmit > 0) {
1128         ram_counters.normal++;
1129     } else if (bytes_xmit == 0) {
1130         ram_counters.duplicate++;
1131     }
1132 
1133     return true;
1134 }
1135 
1136 /*
1137  * directly send the page to the stream
1138  *
1139  * Returns the number of pages written.
1140  *
1141  * @rs: current RAM state
1142  * @block: block that contains the page we want to send
1143  * @offset: offset inside the block for the page
1144  * @buf: the page to be sent
1145  * @async: send to page asyncly
1146  */
1147 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                             uint8_t *buf, bool async)
1149 {
1150     ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                  offset | RAM_SAVE_FLAG_PAGE);
1152     if (async) {
1153         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                               migrate_release_ram() &
1155                               migration_in_postcopy());
1156     } else {
1157         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158     }
1159     ram_counters.transferred += TARGET_PAGE_SIZE;
1160     ram_counters.normal++;
1161     return 1;
1162 }
1163 
1164 /**
1165  * ram_save_page: send the given page to the stream
1166  *
1167  * Returns the number of pages written.
1168  *          < 0 - error
1169  *          >=0 - Number of pages written - this might legally be 0
1170  *                if xbzrle noticed the page was the same.
1171  *
1172  * @rs: current RAM state
1173  * @block: block that contains the page we want to send
1174  * @offset: offset inside the block for the page
1175  * @last_stage: if we are at the completion stage
1176  */
1177 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178 {
1179     int pages = -1;
1180     uint8_t *p;
1181     bool send_async = true;
1182     RAMBlock *block = pss->block;
1183     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184     ram_addr_t current_addr = block->offset + offset;
1185 
1186     p = block->host + offset;
1187     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188 
1189     XBZRLE_cache_lock();
1190     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191         migrate_use_xbzrle()) {
1192         pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                  offset, last_stage);
1194         if (!last_stage) {
1195             /* Can't send this cached data async, since the cache page
1196              * might get updated before it gets to the wire
1197              */
1198             send_async = false;
1199         }
1200     }
1201 
1202     /* XBZRLE overflow or normal page */
1203     if (pages == -1) {
1204         pages = save_normal_page(rs, block, offset, p, send_async);
1205     }
1206 
1207     XBZRLE_cache_unlock();
1208 
1209     return pages;
1210 }
1211 
1212 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                  ram_addr_t offset)
1214 {
1215     if (multifd_queue_page(rs->f, block, offset) < 0) {
1216         return -1;
1217     }
1218     ram_counters.normal++;
1219 
1220     return 1;
1221 }
1222 
1223 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                  ram_addr_t offset, uint8_t *source_buf)
1225 {
1226     RAMState *rs = ram_state;
1227     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228     bool zero_page = false;
1229     int ret;
1230 
1231     if (save_zero_page_to_file(rs, f, block, offset)) {
1232         zero_page = true;
1233         goto exit;
1234     }
1235 
1236     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237 
1238     /*
1239      * copy it to a internal buffer to avoid it being modified by VM
1240      * so that we can catch up the error during compression and
1241      * decompression
1242      */
1243     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245     if (ret < 0) {
1246         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247         error_report("compressed data failed!");
1248         return false;
1249     }
1250 
1251 exit:
1252     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253     return zero_page;
1254 }
1255 
1256 static void
1257 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258 {
1259     ram_counters.transferred += bytes_xmit;
1260 
1261     if (param->zero_page) {
1262         ram_counters.duplicate++;
1263         return;
1264     }
1265 
1266     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267     compression_counters.compressed_size += bytes_xmit - 8;
1268     compression_counters.pages++;
1269 }
1270 
1271 static bool save_page_use_compression(RAMState *rs);
1272 
1273 static void flush_compressed_data(RAMState *rs)
1274 {
1275     int idx, len, thread_count;
1276 
1277     if (!save_page_use_compression(rs)) {
1278         return;
1279     }
1280     thread_count = migrate_compress_threads();
1281 
1282     qemu_mutex_lock(&comp_done_lock);
1283     for (idx = 0; idx < thread_count; idx++) {
1284         while (!comp_param[idx].done) {
1285             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286         }
1287     }
1288     qemu_mutex_unlock(&comp_done_lock);
1289 
1290     for (idx = 0; idx < thread_count; idx++) {
1291         qemu_mutex_lock(&comp_param[idx].mutex);
1292         if (!comp_param[idx].quit) {
1293             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294             /*
1295              * it's safe to fetch zero_page without holding comp_done_lock
1296              * as there is no further request submitted to the thread,
1297              * i.e, the thread should be waiting for a request at this point.
1298              */
1299             update_compress_thread_counts(&comp_param[idx], len);
1300         }
1301         qemu_mutex_unlock(&comp_param[idx].mutex);
1302     }
1303 }
1304 
1305 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                        ram_addr_t offset)
1307 {
1308     param->block = block;
1309     param->offset = offset;
1310 }
1311 
1312 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                            ram_addr_t offset)
1314 {
1315     int idx, thread_count, bytes_xmit = -1, pages = -1;
1316     bool wait = migrate_compress_wait_thread();
1317 
1318     thread_count = migrate_compress_threads();
1319     qemu_mutex_lock(&comp_done_lock);
1320 retry:
1321     for (idx = 0; idx < thread_count; idx++) {
1322         if (comp_param[idx].done) {
1323             comp_param[idx].done = false;
1324             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325             qemu_mutex_lock(&comp_param[idx].mutex);
1326             set_compress_params(&comp_param[idx], block, offset);
1327             qemu_cond_signal(&comp_param[idx].cond);
1328             qemu_mutex_unlock(&comp_param[idx].mutex);
1329             pages = 1;
1330             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331             break;
1332         }
1333     }
1334 
1335     /*
1336      * wait for the free thread if the user specifies 'compress-wait-thread',
1337      * otherwise we will post the page out in the main thread as normal page.
1338      */
1339     if (pages < 0 && wait) {
1340         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341         goto retry;
1342     }
1343     qemu_mutex_unlock(&comp_done_lock);
1344 
1345     return pages;
1346 }
1347 
1348 /**
1349  * find_dirty_block: find the next dirty page and update any state
1350  * associated with the search process.
1351  *
1352  * Returns true if a page is found
1353  *
1354  * @rs: current RAM state
1355  * @pss: data about the state of the current dirty page scan
1356  * @again: set to false if the search has scanned the whole of RAM
1357  */
1358 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359 {
1360     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361     if (pss->complete_round && pss->block == rs->last_seen_block &&
1362         pss->page >= rs->last_page) {
1363         /*
1364          * We've been once around the RAM and haven't found anything.
1365          * Give up.
1366          */
1367         *again = false;
1368         return false;
1369     }
1370     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371         >= pss->block->used_length) {
1372         /* Didn't find anything in this RAM Block */
1373         pss->page = 0;
1374         pss->block = QLIST_NEXT_RCU(pss->block, next);
1375         if (!pss->block) {
1376             /*
1377              * If memory migration starts over, we will meet a dirtied page
1378              * which may still exists in compression threads's ring, so we
1379              * should flush the compressed data to make sure the new page
1380              * is not overwritten by the old one in the destination.
1381              *
1382              * Also If xbzrle is on, stop using the data compression at this
1383              * point. In theory, xbzrle can do better than compression.
1384              */
1385             flush_compressed_data(rs);
1386 
1387             /* Hit the end of the list */
1388             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389             /* Flag that we've looped */
1390             pss->complete_round = true;
1391             rs->ram_bulk_stage = false;
1392         }
1393         /* Didn't find anything this time, but try again on the new block */
1394         *again = true;
1395         return false;
1396     } else {
1397         /* Can go around again, but... */
1398         *again = true;
1399         /* We've found something so probably don't need to */
1400         return true;
1401     }
1402 }
1403 
1404 /**
1405  * unqueue_page: gets a page of the queue
1406  *
1407  * Helper for 'get_queued_page' - gets a page off the queue
1408  *
1409  * Returns the block of the page (or NULL if none available)
1410  *
1411  * @rs: current RAM state
1412  * @offset: used to return the offset within the RAMBlock
1413  */
1414 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415 {
1416     RAMBlock *block = NULL;
1417 
1418     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419         return NULL;
1420     }
1421 
1422     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424         struct RAMSrcPageRequest *entry =
1425                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1426         block = entry->rb;
1427         *offset = entry->offset;
1428 
1429         if (entry->len > TARGET_PAGE_SIZE) {
1430             entry->len -= TARGET_PAGE_SIZE;
1431             entry->offset += TARGET_PAGE_SIZE;
1432         } else {
1433             memory_region_unref(block->mr);
1434             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435             g_free(entry);
1436             migration_consume_urgent_request();
1437         }
1438     }
1439 
1440     return block;
1441 }
1442 
1443 #if defined(__linux__)
1444 /**
1445  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1446  *   is found, return RAM block pointer and page offset
1447  *
1448  * Returns pointer to the RAMBlock containing faulting page,
1449  *   NULL if no write faults are pending
1450  *
1451  * @rs: current RAM state
1452  * @offset: page offset from the beginning of the block
1453  */
1454 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1455 {
1456     struct uffd_msg uffd_msg;
1457     void *page_address;
1458     RAMBlock *block;
1459     int res;
1460 
1461     if (!migrate_background_snapshot()) {
1462         return NULL;
1463     }
1464 
1465     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1466     if (res <= 0) {
1467         return NULL;
1468     }
1469 
1470     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1471     block = qemu_ram_block_from_host(page_address, false, offset);
1472     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1473     return block;
1474 }
1475 
1476 /**
1477  * ram_save_release_protection: release UFFD write protection after
1478  *   a range of pages has been saved
1479  *
1480  * @rs: current RAM state
1481  * @pss: page-search-status structure
1482  * @start_page: index of the first page in the range relative to pss->block
1483  *
1484  * Returns 0 on success, negative value in case of an error
1485 */
1486 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1487         unsigned long start_page)
1488 {
1489     int res = 0;
1490 
1491     /* Check if page is from UFFD-managed region. */
1492     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1493         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1494         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1495 
1496         /* Flush async buffers before un-protect. */
1497         qemu_fflush(rs->f);
1498         /* Un-protect memory range. */
1499         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1500                 false, false);
1501     }
1502 
1503     return res;
1504 }
1505 
1506 /* ram_write_tracking_available: check if kernel supports required UFFD features
1507  *
1508  * Returns true if supports, false otherwise
1509  */
1510 bool ram_write_tracking_available(void)
1511 {
1512     uint64_t uffd_features;
1513     int res;
1514 
1515     res = uffd_query_features(&uffd_features);
1516     return (res == 0 &&
1517             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1518 }
1519 
1520 /* ram_write_tracking_compatible: check if guest configuration is
1521  *   compatible with 'write-tracking'
1522  *
1523  * Returns true if compatible, false otherwise
1524  */
1525 bool ram_write_tracking_compatible(void)
1526 {
1527     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1528     int uffd_fd;
1529     RAMBlock *block;
1530     bool ret = false;
1531 
1532     /* Open UFFD file descriptor */
1533     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1534     if (uffd_fd < 0) {
1535         return false;
1536     }
1537 
1538     RCU_READ_LOCK_GUARD();
1539 
1540     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1541         uint64_t uffd_ioctls;
1542 
1543         /* Nothing to do with read-only and MMIO-writable regions */
1544         if (block->mr->readonly || block->mr->rom_device) {
1545             continue;
1546         }
1547         /* Try to register block memory via UFFD-IO to track writes */
1548         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1549                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1550             goto out;
1551         }
1552         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1553             goto out;
1554         }
1555     }
1556     ret = true;
1557 
1558 out:
1559     uffd_close_fd(uffd_fd);
1560     return ret;
1561 }
1562 
1563 /*
1564  * ram_block_populate_pages: populate memory in the RAM block by reading
1565  *   an integer from the beginning of each page.
1566  *
1567  * Since it's solely used for userfault_fd WP feature, here we just
1568  *   hardcode page size to qemu_real_host_page_size.
1569  *
1570  * @block: RAM block to populate
1571  */
1572 static void ram_block_populate_pages(RAMBlock *block)
1573 {
1574     char *ptr = (char *) block->host;
1575 
1576     for (ram_addr_t offset = 0; offset < block->used_length;
1577             offset += qemu_real_host_page_size) {
1578         char tmp = *(ptr + offset);
1579 
1580         /* Don't optimize the read out */
1581         asm volatile("" : "+r" (tmp));
1582     }
1583 }
1584 
1585 /*
1586  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1587  */
1588 void ram_write_tracking_prepare(void)
1589 {
1590     RAMBlock *block;
1591 
1592     RCU_READ_LOCK_GUARD();
1593 
1594     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1595         /* Nothing to do with read-only and MMIO-writable regions */
1596         if (block->mr->readonly || block->mr->rom_device) {
1597             continue;
1598         }
1599 
1600         /*
1601          * Populate pages of the RAM block before enabling userfault_fd
1602          * write protection.
1603          *
1604          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1605          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1606          * pages with pte_none() entries in page table.
1607          */
1608         ram_block_populate_pages(block);
1609     }
1610 }
1611 
1612 /*
1613  * ram_write_tracking_start: start UFFD-WP memory tracking
1614  *
1615  * Returns 0 for success or negative value in case of error
1616  */
1617 int ram_write_tracking_start(void)
1618 {
1619     int uffd_fd;
1620     RAMState *rs = ram_state;
1621     RAMBlock *block;
1622 
1623     /* Open UFFD file descriptor */
1624     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1625     if (uffd_fd < 0) {
1626         return uffd_fd;
1627     }
1628     rs->uffdio_fd = uffd_fd;
1629 
1630     RCU_READ_LOCK_GUARD();
1631 
1632     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1633         /* Nothing to do with read-only and MMIO-writable regions */
1634         if (block->mr->readonly || block->mr->rom_device) {
1635             continue;
1636         }
1637 
1638         /* Register block memory with UFFD to track writes */
1639         if (uffd_register_memory(rs->uffdio_fd, block->host,
1640                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1641             goto fail;
1642         }
1643         /* Apply UFFD write protection to the block memory range */
1644         if (uffd_change_protection(rs->uffdio_fd, block->host,
1645                 block->max_length, true, false)) {
1646             goto fail;
1647         }
1648         block->flags |= RAM_UF_WRITEPROTECT;
1649         memory_region_ref(block->mr);
1650 
1651         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1652                 block->host, block->max_length);
1653     }
1654 
1655     return 0;
1656 
1657 fail:
1658     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1659 
1660     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1661         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1662             continue;
1663         }
1664         /*
1665          * In case some memory block failed to be write-protected
1666          * remove protection and unregister all succeeded RAM blocks
1667          */
1668         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1669                 false, false);
1670         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1671         /* Cleanup flags and remove reference */
1672         block->flags &= ~RAM_UF_WRITEPROTECT;
1673         memory_region_unref(block->mr);
1674     }
1675 
1676     uffd_close_fd(uffd_fd);
1677     rs->uffdio_fd = -1;
1678     return -1;
1679 }
1680 
1681 /**
1682  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1683  */
1684 void ram_write_tracking_stop(void)
1685 {
1686     RAMState *rs = ram_state;
1687     RAMBlock *block;
1688 
1689     RCU_READ_LOCK_GUARD();
1690 
1691     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1692         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1693             continue;
1694         }
1695         /* Remove protection and unregister all affected RAM blocks */
1696         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1697                 false, false);
1698         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1699 
1700         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1701                 block->host, block->max_length);
1702 
1703         /* Cleanup flags and remove reference */
1704         block->flags &= ~RAM_UF_WRITEPROTECT;
1705         memory_region_unref(block->mr);
1706     }
1707 
1708     /* Finally close UFFD file descriptor */
1709     uffd_close_fd(rs->uffdio_fd);
1710     rs->uffdio_fd = -1;
1711 }
1712 
1713 #else
1714 /* No target OS support, stubs just fail or ignore */
1715 
1716 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1717 {
1718     (void) rs;
1719     (void) offset;
1720 
1721     return NULL;
1722 }
1723 
1724 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1725         unsigned long start_page)
1726 {
1727     (void) rs;
1728     (void) pss;
1729     (void) start_page;
1730 
1731     return 0;
1732 }
1733 
1734 bool ram_write_tracking_available(void)
1735 {
1736     return false;
1737 }
1738 
1739 bool ram_write_tracking_compatible(void)
1740 {
1741     assert(0);
1742     return false;
1743 }
1744 
1745 int ram_write_tracking_start(void)
1746 {
1747     assert(0);
1748     return -1;
1749 }
1750 
1751 void ram_write_tracking_stop(void)
1752 {
1753     assert(0);
1754 }
1755 #endif /* defined(__linux__) */
1756 
1757 /**
1758  * get_queued_page: unqueue a page from the postcopy requests
1759  *
1760  * Skips pages that are already sent (!dirty)
1761  *
1762  * Returns true if a queued page is found
1763  *
1764  * @rs: current RAM state
1765  * @pss: data about the state of the current dirty page scan
1766  */
1767 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1768 {
1769     RAMBlock  *block;
1770     ram_addr_t offset;
1771     bool dirty;
1772 
1773     do {
1774         block = unqueue_page(rs, &offset);
1775         /*
1776          * We're sending this page, and since it's postcopy nothing else
1777          * will dirty it, and we must make sure it doesn't get sent again
1778          * even if this queue request was received after the background
1779          * search already sent it.
1780          */
1781         if (block) {
1782             unsigned long page;
1783 
1784             page = offset >> TARGET_PAGE_BITS;
1785             dirty = test_bit(page, block->bmap);
1786             if (!dirty) {
1787                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1788                                                 page);
1789             } else {
1790                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1791             }
1792         }
1793 
1794     } while (block && !dirty);
1795 
1796     if (!block) {
1797         /*
1798          * Poll write faults too if background snapshot is enabled; that's
1799          * when we have vcpus got blocked by the write protected pages.
1800          */
1801         block = poll_fault_page(rs, &offset);
1802     }
1803 
1804     if (block) {
1805         /*
1806          * As soon as we start servicing pages out of order, then we have
1807          * to kill the bulk stage, since the bulk stage assumes
1808          * in (migration_bitmap_find_and_reset_dirty) that every page is
1809          * dirty, that's no longer true.
1810          */
1811         rs->ram_bulk_stage = false;
1812 
1813         /*
1814          * We want the background search to continue from the queued page
1815          * since the guest is likely to want other pages near to the page
1816          * it just requested.
1817          */
1818         pss->block = block;
1819         pss->page = offset >> TARGET_PAGE_BITS;
1820 
1821         /*
1822          * This unqueued page would break the "one round" check, even is
1823          * really rare.
1824          */
1825         pss->complete_round = false;
1826     }
1827 
1828     return !!block;
1829 }
1830 
1831 /**
1832  * migration_page_queue_free: drop any remaining pages in the ram
1833  * request queue
1834  *
1835  * It should be empty at the end anyway, but in error cases there may
1836  * be some left.  in case that there is any page left, we drop it.
1837  *
1838  */
1839 static void migration_page_queue_free(RAMState *rs)
1840 {
1841     struct RAMSrcPageRequest *mspr, *next_mspr;
1842     /* This queue generally should be empty - but in the case of a failed
1843      * migration might have some droppings in.
1844      */
1845     RCU_READ_LOCK_GUARD();
1846     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1847         memory_region_unref(mspr->rb->mr);
1848         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1849         g_free(mspr);
1850     }
1851 }
1852 
1853 /**
1854  * ram_save_queue_pages: queue the page for transmission
1855  *
1856  * A request from postcopy destination for example.
1857  *
1858  * Returns zero on success or negative on error
1859  *
1860  * @rbname: Name of the RAMBLock of the request. NULL means the
1861  *          same that last one.
1862  * @start: starting address from the start of the RAMBlock
1863  * @len: length (in bytes) to send
1864  */
1865 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1866 {
1867     RAMBlock *ramblock;
1868     RAMState *rs = ram_state;
1869 
1870     ram_counters.postcopy_requests++;
1871     RCU_READ_LOCK_GUARD();
1872 
1873     if (!rbname) {
1874         /* Reuse last RAMBlock */
1875         ramblock = rs->last_req_rb;
1876 
1877         if (!ramblock) {
1878             /*
1879              * Shouldn't happen, we can't reuse the last RAMBlock if
1880              * it's the 1st request.
1881              */
1882             error_report("ram_save_queue_pages no previous block");
1883             return -1;
1884         }
1885     } else {
1886         ramblock = qemu_ram_block_by_name(rbname);
1887 
1888         if (!ramblock) {
1889             /* We shouldn't be asked for a non-existent RAMBlock */
1890             error_report("ram_save_queue_pages no block '%s'", rbname);
1891             return -1;
1892         }
1893         rs->last_req_rb = ramblock;
1894     }
1895     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1896     if (start + len > ramblock->used_length) {
1897         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1898                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1899                      __func__, start, len, ramblock->used_length);
1900         return -1;
1901     }
1902 
1903     struct RAMSrcPageRequest *new_entry =
1904         g_malloc0(sizeof(struct RAMSrcPageRequest));
1905     new_entry->rb = ramblock;
1906     new_entry->offset = start;
1907     new_entry->len = len;
1908 
1909     memory_region_ref(ramblock->mr);
1910     qemu_mutex_lock(&rs->src_page_req_mutex);
1911     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1912     migration_make_urgent_request();
1913     qemu_mutex_unlock(&rs->src_page_req_mutex);
1914 
1915     return 0;
1916 }
1917 
1918 static bool save_page_use_compression(RAMState *rs)
1919 {
1920     if (!migrate_use_compression()) {
1921         return false;
1922     }
1923 
1924     /*
1925      * If xbzrle is on, stop using the data compression after first
1926      * round of migration even if compression is enabled. In theory,
1927      * xbzrle can do better than compression.
1928      */
1929     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1930         return true;
1931     }
1932 
1933     return false;
1934 }
1935 
1936 /*
1937  * try to compress the page before posting it out, return true if the page
1938  * has been properly handled by compression, otherwise needs other
1939  * paths to handle it
1940  */
1941 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1942 {
1943     if (!save_page_use_compression(rs)) {
1944         return false;
1945     }
1946 
1947     /*
1948      * When starting the process of a new block, the first page of
1949      * the block should be sent out before other pages in the same
1950      * block, and all the pages in last block should have been sent
1951      * out, keeping this order is important, because the 'cont' flag
1952      * is used to avoid resending the block name.
1953      *
1954      * We post the fist page as normal page as compression will take
1955      * much CPU resource.
1956      */
1957     if (block != rs->last_sent_block) {
1958         flush_compressed_data(rs);
1959         return false;
1960     }
1961 
1962     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1963         return true;
1964     }
1965 
1966     compression_counters.busy++;
1967     return false;
1968 }
1969 
1970 /**
1971  * ram_save_target_page: save one target page
1972  *
1973  * Returns the number of pages written
1974  *
1975  * @rs: current RAM state
1976  * @pss: data about the page we want to send
1977  * @last_stage: if we are at the completion stage
1978  */
1979 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1980                                 bool last_stage)
1981 {
1982     RAMBlock *block = pss->block;
1983     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1984     int res;
1985 
1986     if (control_save_page(rs, block, offset, &res)) {
1987         return res;
1988     }
1989 
1990     if (save_compress_page(rs, block, offset)) {
1991         return 1;
1992     }
1993 
1994     res = save_zero_page(rs, block, offset);
1995     if (res > 0) {
1996         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1997          * page would be stale
1998          */
1999         if (!save_page_use_compression(rs)) {
2000             XBZRLE_cache_lock();
2001             xbzrle_cache_zero_page(rs, block->offset + offset);
2002             XBZRLE_cache_unlock();
2003         }
2004         ram_release_pages(block->idstr, offset, res);
2005         return res;
2006     }
2007 
2008     /*
2009      * Do not use multifd for:
2010      * 1. Compression as the first page in the new block should be posted out
2011      *    before sending the compressed page
2012      * 2. In postcopy as one whole host page should be placed
2013      */
2014     if (!save_page_use_compression(rs) && migrate_use_multifd()
2015         && !migration_in_postcopy()) {
2016         return ram_save_multifd_page(rs, block, offset);
2017     }
2018 
2019     return ram_save_page(rs, pss, last_stage);
2020 }
2021 
2022 /**
2023  * ram_save_host_page: save a whole host page
2024  *
2025  * Starting at *offset send pages up to the end of the current host
2026  * page. It's valid for the initial offset to point into the middle of
2027  * a host page in which case the remainder of the hostpage is sent.
2028  * Only dirty target pages are sent. Note that the host page size may
2029  * be a huge page for this block.
2030  * The saving stops at the boundary of the used_length of the block
2031  * if the RAMBlock isn't a multiple of the host page size.
2032  *
2033  * Returns the number of pages written or negative on error
2034  *
2035  * @rs: current RAM state
2036  * @ms: current migration state
2037  * @pss: data about the page we want to send
2038  * @last_stage: if we are at the completion stage
2039  */
2040 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2041                               bool last_stage)
2042 {
2043     int tmppages, pages = 0;
2044     size_t pagesize_bits =
2045         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2046     unsigned long start_page = pss->page;
2047     int res;
2048 
2049     if (ramblock_is_ignored(pss->block)) {
2050         error_report("block %s should not be migrated !", pss->block->idstr);
2051         return 0;
2052     }
2053 
2054     do {
2055         /* Check the pages is dirty and if it is send it */
2056         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057             pss->page++;
2058             continue;
2059         }
2060 
2061         tmppages = ram_save_target_page(rs, pss, last_stage);
2062         if (tmppages < 0) {
2063             return tmppages;
2064         }
2065 
2066         pages += tmppages;
2067         pss->page++;
2068         /* Allow rate limiting to happen in the middle of huge pages */
2069         migration_rate_limit();
2070     } while ((pss->page & (pagesize_bits - 1)) &&
2071              offset_in_ramblock(pss->block,
2072                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2073     /* The offset we leave with is the last one we looked at */
2074     pss->page--;
2075 
2076     res = ram_save_release_protection(rs, pss, start_page);
2077     return (res < 0 ? res : pages);
2078 }
2079 
2080 /**
2081  * ram_find_and_save_block: finds a dirty page and sends it to f
2082  *
2083  * Called within an RCU critical section.
2084  *
2085  * Returns the number of pages written where zero means no dirty pages,
2086  * or negative on error
2087  *
2088  * @rs: current RAM state
2089  * @last_stage: if we are at the completion stage
2090  *
2091  * On systems where host-page-size > target-page-size it will send all the
2092  * pages in a host page that are dirty.
2093  */
2094 
2095 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2096 {
2097     PageSearchStatus pss;
2098     int pages = 0;
2099     bool again, found;
2100 
2101     /* No dirty page as there is zero RAM */
2102     if (!ram_bytes_total()) {
2103         return pages;
2104     }
2105 
2106     pss.block = rs->last_seen_block;
2107     pss.page = rs->last_page;
2108     pss.complete_round = false;
2109 
2110     if (!pss.block) {
2111         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2112     }
2113 
2114     do {
2115         again = true;
2116         found = get_queued_page(rs, &pss);
2117 
2118         if (!found) {
2119             /* priority queue empty, so just search for something dirty */
2120             found = find_dirty_block(rs, &pss, &again);
2121         }
2122 
2123         if (found) {
2124             pages = ram_save_host_page(rs, &pss, last_stage);
2125         }
2126     } while (!pages && again);
2127 
2128     rs->last_seen_block = pss.block;
2129     rs->last_page = pss.page;
2130 
2131     return pages;
2132 }
2133 
2134 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2135 {
2136     uint64_t pages = size / TARGET_PAGE_SIZE;
2137 
2138     if (zero) {
2139         ram_counters.duplicate += pages;
2140     } else {
2141         ram_counters.normal += pages;
2142         ram_counters.transferred += size;
2143         qemu_update_position(f, size);
2144     }
2145 }
2146 
2147 static uint64_t ram_bytes_total_common(bool count_ignored)
2148 {
2149     RAMBlock *block;
2150     uint64_t total = 0;
2151 
2152     RCU_READ_LOCK_GUARD();
2153 
2154     if (count_ignored) {
2155         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2156             total += block->used_length;
2157         }
2158     } else {
2159         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2160             total += block->used_length;
2161         }
2162     }
2163     return total;
2164 }
2165 
2166 uint64_t ram_bytes_total(void)
2167 {
2168     return ram_bytes_total_common(false);
2169 }
2170 
2171 static void xbzrle_load_setup(void)
2172 {
2173     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2174 }
2175 
2176 static void xbzrle_load_cleanup(void)
2177 {
2178     g_free(XBZRLE.decoded_buf);
2179     XBZRLE.decoded_buf = NULL;
2180 }
2181 
2182 static void ram_state_cleanup(RAMState **rsp)
2183 {
2184     if (*rsp) {
2185         migration_page_queue_free(*rsp);
2186         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2187         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2188         g_free(*rsp);
2189         *rsp = NULL;
2190     }
2191 }
2192 
2193 static void xbzrle_cleanup(void)
2194 {
2195     XBZRLE_cache_lock();
2196     if (XBZRLE.cache) {
2197         cache_fini(XBZRLE.cache);
2198         g_free(XBZRLE.encoded_buf);
2199         g_free(XBZRLE.current_buf);
2200         g_free(XBZRLE.zero_target_page);
2201         XBZRLE.cache = NULL;
2202         XBZRLE.encoded_buf = NULL;
2203         XBZRLE.current_buf = NULL;
2204         XBZRLE.zero_target_page = NULL;
2205     }
2206     XBZRLE_cache_unlock();
2207 }
2208 
2209 static void ram_save_cleanup(void *opaque)
2210 {
2211     RAMState **rsp = opaque;
2212     RAMBlock *block;
2213 
2214     /* We don't use dirty log with background snapshots */
2215     if (!migrate_background_snapshot()) {
2216         /* caller have hold iothread lock or is in a bh, so there is
2217          * no writing race against the migration bitmap
2218          */
2219         memory_global_dirty_log_stop();
2220     }
2221 
2222     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2223         g_free(block->clear_bmap);
2224         block->clear_bmap = NULL;
2225         g_free(block->bmap);
2226         block->bmap = NULL;
2227     }
2228 
2229     xbzrle_cleanup();
2230     compress_threads_save_cleanup();
2231     ram_state_cleanup(rsp);
2232 }
2233 
2234 static void ram_state_reset(RAMState *rs)
2235 {
2236     rs->last_seen_block = NULL;
2237     rs->last_sent_block = NULL;
2238     rs->last_page = 0;
2239     rs->last_version = ram_list.version;
2240     rs->ram_bulk_stage = true;
2241     rs->fpo_enabled = false;
2242 }
2243 
2244 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2245 
2246 /*
2247  * 'expected' is the value you expect the bitmap mostly to be full
2248  * of; it won't bother printing lines that are all this value.
2249  * If 'todump' is null the migration bitmap is dumped.
2250  */
2251 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2252                            unsigned long pages)
2253 {
2254     int64_t cur;
2255     int64_t linelen = 128;
2256     char linebuf[129];
2257 
2258     for (cur = 0; cur < pages; cur += linelen) {
2259         int64_t curb;
2260         bool found = false;
2261         /*
2262          * Last line; catch the case where the line length
2263          * is longer than remaining ram
2264          */
2265         if (cur + linelen > pages) {
2266             linelen = pages - cur;
2267         }
2268         for (curb = 0; curb < linelen; curb++) {
2269             bool thisbit = test_bit(cur + curb, todump);
2270             linebuf[curb] = thisbit ? '1' : '.';
2271             found = found || (thisbit != expected);
2272         }
2273         if (found) {
2274             linebuf[curb] = '\0';
2275             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2276         }
2277     }
2278 }
2279 
2280 /* **** functions for postcopy ***** */
2281 
2282 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2283 {
2284     struct RAMBlock *block;
2285 
2286     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2287         unsigned long *bitmap = block->bmap;
2288         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2289         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2290 
2291         while (run_start < range) {
2292             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2293             ram_discard_range(block->idstr,
2294                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2295                               ((ram_addr_t)(run_end - run_start))
2296                                 << TARGET_PAGE_BITS);
2297             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2298         }
2299     }
2300 }
2301 
2302 /**
2303  * postcopy_send_discard_bm_ram: discard a RAMBlock
2304  *
2305  * Returns zero on success
2306  *
2307  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2308  *
2309  * @ms: current migration state
2310  * @block: RAMBlock to discard
2311  */
2312 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2313 {
2314     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2315     unsigned long current;
2316     unsigned long *bitmap = block->bmap;
2317 
2318     for (current = 0; current < end; ) {
2319         unsigned long one = find_next_bit(bitmap, end, current);
2320         unsigned long zero, discard_length;
2321 
2322         if (one >= end) {
2323             break;
2324         }
2325 
2326         zero = find_next_zero_bit(bitmap, end, one + 1);
2327 
2328         if (zero >= end) {
2329             discard_length = end - one;
2330         } else {
2331             discard_length = zero - one;
2332         }
2333         postcopy_discard_send_range(ms, one, discard_length);
2334         current = one + discard_length;
2335     }
2336 
2337     return 0;
2338 }
2339 
2340 /**
2341  * postcopy_each_ram_send_discard: discard all RAMBlocks
2342  *
2343  * Returns 0 for success or negative for error
2344  *
2345  * Utility for the outgoing postcopy code.
2346  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2347  *   passing it bitmap indexes and name.
2348  * (qemu_ram_foreach_block ends up passing unscaled lengths
2349  *  which would mean postcopy code would have to deal with target page)
2350  *
2351  * @ms: current migration state
2352  */
2353 static int postcopy_each_ram_send_discard(MigrationState *ms)
2354 {
2355     struct RAMBlock *block;
2356     int ret;
2357 
2358     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2359         postcopy_discard_send_init(ms, block->idstr);
2360 
2361         /*
2362          * Postcopy sends chunks of bitmap over the wire, but it
2363          * just needs indexes at this point, avoids it having
2364          * target page specific code.
2365          */
2366         ret = postcopy_send_discard_bm_ram(ms, block);
2367         postcopy_discard_send_finish(ms);
2368         if (ret) {
2369             return ret;
2370         }
2371     }
2372 
2373     return 0;
2374 }
2375 
2376 /**
2377  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2378  *
2379  * Helper for postcopy_chunk_hostpages; it's called twice to
2380  * canonicalize the two bitmaps, that are similar, but one is
2381  * inverted.
2382  *
2383  * Postcopy requires that all target pages in a hostpage are dirty or
2384  * clean, not a mix.  This function canonicalizes the bitmaps.
2385  *
2386  * @ms: current migration state
2387  * @block: block that contains the page we want to canonicalize
2388  */
2389 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2390 {
2391     RAMState *rs = ram_state;
2392     unsigned long *bitmap = block->bmap;
2393     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2394     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2395     unsigned long run_start;
2396 
2397     if (block->page_size == TARGET_PAGE_SIZE) {
2398         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2399         return;
2400     }
2401 
2402     /* Find a dirty page */
2403     run_start = find_next_bit(bitmap, pages, 0);
2404 
2405     while (run_start < pages) {
2406 
2407         /*
2408          * If the start of this run of pages is in the middle of a host
2409          * page, then we need to fixup this host page.
2410          */
2411         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2412             /* Find the end of this run */
2413             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2414             /*
2415              * If the end isn't at the start of a host page, then the
2416              * run doesn't finish at the end of a host page
2417              * and we need to discard.
2418              */
2419         }
2420 
2421         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2422             unsigned long page;
2423             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2424                                                              host_ratio);
2425             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2426 
2427             /* Clean up the bitmap */
2428             for (page = fixup_start_addr;
2429                  page < fixup_start_addr + host_ratio; page++) {
2430                 /*
2431                  * Remark them as dirty, updating the count for any pages
2432                  * that weren't previously dirty.
2433                  */
2434                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2435             }
2436         }
2437 
2438         /* Find the next dirty page for the next iteration */
2439         run_start = find_next_bit(bitmap, pages, run_start);
2440     }
2441 }
2442 
2443 /**
2444  * postcopy_chunk_hostpages: discard any partially sent host page
2445  *
2446  * Utility for the outgoing postcopy code.
2447  *
2448  * Discard any partially sent host-page size chunks, mark any partially
2449  * dirty host-page size chunks as all dirty.  In this case the host-page
2450  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2451  *
2452  * Returns zero on success
2453  *
2454  * @ms: current migration state
2455  * @block: block we want to work with
2456  */
2457 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2458 {
2459     postcopy_discard_send_init(ms, block->idstr);
2460 
2461     /*
2462      * Ensure that all partially dirty host pages are made fully dirty.
2463      */
2464     postcopy_chunk_hostpages_pass(ms, block);
2465 
2466     postcopy_discard_send_finish(ms);
2467     return 0;
2468 }
2469 
2470 /**
2471  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2472  *
2473  * Returns zero on success
2474  *
2475  * Transmit the set of pages to be discarded after precopy to the target
2476  * these are pages that:
2477  *     a) Have been previously transmitted but are now dirty again
2478  *     b) Pages that have never been transmitted, this ensures that
2479  *        any pages on the destination that have been mapped by background
2480  *        tasks get discarded (transparent huge pages is the specific concern)
2481  * Hopefully this is pretty sparse
2482  *
2483  * @ms: current migration state
2484  */
2485 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2486 {
2487     RAMState *rs = ram_state;
2488     RAMBlock *block;
2489     int ret;
2490 
2491     RCU_READ_LOCK_GUARD();
2492 
2493     /* This should be our last sync, the src is now paused */
2494     migration_bitmap_sync(rs);
2495 
2496     /* Easiest way to make sure we don't resume in the middle of a host-page */
2497     rs->last_seen_block = NULL;
2498     rs->last_sent_block = NULL;
2499     rs->last_page = 0;
2500 
2501     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2502         /* Deal with TPS != HPS and huge pages */
2503         ret = postcopy_chunk_hostpages(ms, block);
2504         if (ret) {
2505             return ret;
2506         }
2507 
2508 #ifdef DEBUG_POSTCOPY
2509         ram_debug_dump_bitmap(block->bmap, true,
2510                               block->used_length >> TARGET_PAGE_BITS);
2511 #endif
2512     }
2513     trace_ram_postcopy_send_discard_bitmap();
2514 
2515     return postcopy_each_ram_send_discard(ms);
2516 }
2517 
2518 /**
2519  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2520  *
2521  * Returns zero on success
2522  *
2523  * @rbname: name of the RAMBlock of the request. NULL means the
2524  *          same that last one.
2525  * @start: RAMBlock starting page
2526  * @length: RAMBlock size
2527  */
2528 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2529 {
2530     trace_ram_discard_range(rbname, start, length);
2531 
2532     RCU_READ_LOCK_GUARD();
2533     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2534 
2535     if (!rb) {
2536         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2537         return -1;
2538     }
2539 
2540     /*
2541      * On source VM, we don't need to update the received bitmap since
2542      * we don't even have one.
2543      */
2544     if (rb->receivedmap) {
2545         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2546                      length >> qemu_target_page_bits());
2547     }
2548 
2549     return ram_block_discard_range(rb, start, length);
2550 }
2551 
2552 /*
2553  * For every allocation, we will try not to crash the VM if the
2554  * allocation failed.
2555  */
2556 static int xbzrle_init(void)
2557 {
2558     Error *local_err = NULL;
2559 
2560     if (!migrate_use_xbzrle()) {
2561         return 0;
2562     }
2563 
2564     XBZRLE_cache_lock();
2565 
2566     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2567     if (!XBZRLE.zero_target_page) {
2568         error_report("%s: Error allocating zero page", __func__);
2569         goto err_out;
2570     }
2571 
2572     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2573                               TARGET_PAGE_SIZE, &local_err);
2574     if (!XBZRLE.cache) {
2575         error_report_err(local_err);
2576         goto free_zero_page;
2577     }
2578 
2579     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2580     if (!XBZRLE.encoded_buf) {
2581         error_report("%s: Error allocating encoded_buf", __func__);
2582         goto free_cache;
2583     }
2584 
2585     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2586     if (!XBZRLE.current_buf) {
2587         error_report("%s: Error allocating current_buf", __func__);
2588         goto free_encoded_buf;
2589     }
2590 
2591     /* We are all good */
2592     XBZRLE_cache_unlock();
2593     return 0;
2594 
2595 free_encoded_buf:
2596     g_free(XBZRLE.encoded_buf);
2597     XBZRLE.encoded_buf = NULL;
2598 free_cache:
2599     cache_fini(XBZRLE.cache);
2600     XBZRLE.cache = NULL;
2601 free_zero_page:
2602     g_free(XBZRLE.zero_target_page);
2603     XBZRLE.zero_target_page = NULL;
2604 err_out:
2605     XBZRLE_cache_unlock();
2606     return -ENOMEM;
2607 }
2608 
2609 static int ram_state_init(RAMState **rsp)
2610 {
2611     *rsp = g_try_new0(RAMState, 1);
2612 
2613     if (!*rsp) {
2614         error_report("%s: Init ramstate fail", __func__);
2615         return -1;
2616     }
2617 
2618     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2619     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2620     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2621 
2622     /*
2623      * Count the total number of pages used by ram blocks not including any
2624      * gaps due to alignment or unplugs.
2625      * This must match with the initial values of dirty bitmap.
2626      */
2627     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2628     ram_state_reset(*rsp);
2629 
2630     return 0;
2631 }
2632 
2633 static void ram_list_init_bitmaps(void)
2634 {
2635     MigrationState *ms = migrate_get_current();
2636     RAMBlock *block;
2637     unsigned long pages;
2638     uint8_t shift;
2639 
2640     /* Skip setting bitmap if there is no RAM */
2641     if (ram_bytes_total()) {
2642         shift = ms->clear_bitmap_shift;
2643         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2644             error_report("clear_bitmap_shift (%u) too big, using "
2645                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2646             shift = CLEAR_BITMAP_SHIFT_MAX;
2647         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2648             error_report("clear_bitmap_shift (%u) too small, using "
2649                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2650             shift = CLEAR_BITMAP_SHIFT_MIN;
2651         }
2652 
2653         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654             pages = block->max_length >> TARGET_PAGE_BITS;
2655             /*
2656              * The initial dirty bitmap for migration must be set with all
2657              * ones to make sure we'll migrate every guest RAM page to
2658              * destination.
2659              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2660              * new migration after a failed migration, ram_list.
2661              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2662              * guest memory.
2663              */
2664             block->bmap = bitmap_new(pages);
2665             bitmap_set(block->bmap, 0, pages);
2666             block->clear_bmap_shift = shift;
2667             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2668         }
2669     }
2670 }
2671 
2672 static void ram_init_bitmaps(RAMState *rs)
2673 {
2674     /* For memory_global_dirty_log_start below.  */
2675     qemu_mutex_lock_iothread();
2676     qemu_mutex_lock_ramlist();
2677 
2678     WITH_RCU_READ_LOCK_GUARD() {
2679         ram_list_init_bitmaps();
2680         /* We don't use dirty log with background snapshots */
2681         if (!migrate_background_snapshot()) {
2682             memory_global_dirty_log_start();
2683             migration_bitmap_sync_precopy(rs);
2684         }
2685     }
2686     qemu_mutex_unlock_ramlist();
2687     qemu_mutex_unlock_iothread();
2688 }
2689 
2690 static int ram_init_all(RAMState **rsp)
2691 {
2692     if (ram_state_init(rsp)) {
2693         return -1;
2694     }
2695 
2696     if (xbzrle_init()) {
2697         ram_state_cleanup(rsp);
2698         return -1;
2699     }
2700 
2701     ram_init_bitmaps(*rsp);
2702 
2703     return 0;
2704 }
2705 
2706 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2707 {
2708     RAMBlock *block;
2709     uint64_t pages = 0;
2710 
2711     /*
2712      * Postcopy is not using xbzrle/compression, so no need for that.
2713      * Also, since source are already halted, we don't need to care
2714      * about dirty page logging as well.
2715      */
2716 
2717     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2718         pages += bitmap_count_one(block->bmap,
2719                                   block->used_length >> TARGET_PAGE_BITS);
2720     }
2721 
2722     /* This may not be aligned with current bitmaps. Recalculate. */
2723     rs->migration_dirty_pages = pages;
2724 
2725     rs->last_seen_block = NULL;
2726     rs->last_sent_block = NULL;
2727     rs->last_page = 0;
2728     rs->last_version = ram_list.version;
2729     /*
2730      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2731      * matter what we have sent.
2732      */
2733     rs->ram_bulk_stage = false;
2734 
2735     /* Update RAMState cache of output QEMUFile */
2736     rs->f = out;
2737 
2738     trace_ram_state_resume_prepare(pages);
2739 }
2740 
2741 /*
2742  * This function clears bits of the free pages reported by the caller from the
2743  * migration dirty bitmap. @addr is the host address corresponding to the
2744  * start of the continuous guest free pages, and @len is the total bytes of
2745  * those pages.
2746  */
2747 void qemu_guest_free_page_hint(void *addr, size_t len)
2748 {
2749     RAMBlock *block;
2750     ram_addr_t offset;
2751     size_t used_len, start, npages;
2752     MigrationState *s = migrate_get_current();
2753 
2754     /* This function is currently expected to be used during live migration */
2755     if (!migration_is_setup_or_active(s->state)) {
2756         return;
2757     }
2758 
2759     for (; len > 0; len -= used_len, addr += used_len) {
2760         block = qemu_ram_block_from_host(addr, false, &offset);
2761         if (unlikely(!block || offset >= block->used_length)) {
2762             /*
2763              * The implementation might not support RAMBlock resize during
2764              * live migration, but it could happen in theory with future
2765              * updates. So we add a check here to capture that case.
2766              */
2767             error_report_once("%s unexpected error", __func__);
2768             return;
2769         }
2770 
2771         if (len <= block->used_length - offset) {
2772             used_len = len;
2773         } else {
2774             used_len = block->used_length - offset;
2775         }
2776 
2777         start = offset >> TARGET_PAGE_BITS;
2778         npages = used_len >> TARGET_PAGE_BITS;
2779 
2780         qemu_mutex_lock(&ram_state->bitmap_mutex);
2781         ram_state->migration_dirty_pages -=
2782                       bitmap_count_one_with_offset(block->bmap, start, npages);
2783         bitmap_clear(block->bmap, start, npages);
2784         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2785     }
2786 }
2787 
2788 /*
2789  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2790  * long-running RCU critical section.  When rcu-reclaims in the code
2791  * start to become numerous it will be necessary to reduce the
2792  * granularity of these critical sections.
2793  */
2794 
2795 /**
2796  * ram_save_setup: Setup RAM for migration
2797  *
2798  * Returns zero to indicate success and negative for error
2799  *
2800  * @f: QEMUFile where to send the data
2801  * @opaque: RAMState pointer
2802  */
2803 static int ram_save_setup(QEMUFile *f, void *opaque)
2804 {
2805     RAMState **rsp = opaque;
2806     RAMBlock *block;
2807 
2808     if (compress_threads_save_setup()) {
2809         return -1;
2810     }
2811 
2812     /* migration has already setup the bitmap, reuse it. */
2813     if (!migration_in_colo_state()) {
2814         if (ram_init_all(rsp) != 0) {
2815             compress_threads_save_cleanup();
2816             return -1;
2817         }
2818     }
2819     (*rsp)->f = f;
2820 
2821     WITH_RCU_READ_LOCK_GUARD() {
2822         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2823 
2824         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2825             qemu_put_byte(f, strlen(block->idstr));
2826             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2827             qemu_put_be64(f, block->used_length);
2828             if (migrate_postcopy_ram() && block->page_size !=
2829                                           qemu_host_page_size) {
2830                 qemu_put_be64(f, block->page_size);
2831             }
2832             if (migrate_ignore_shared()) {
2833                 qemu_put_be64(f, block->mr->addr);
2834             }
2835         }
2836     }
2837 
2838     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2839     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2840 
2841     multifd_send_sync_main(f);
2842     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2843     qemu_fflush(f);
2844 
2845     return 0;
2846 }
2847 
2848 /**
2849  * ram_save_iterate: iterative stage for migration
2850  *
2851  * Returns zero to indicate success and negative for error
2852  *
2853  * @f: QEMUFile where to send the data
2854  * @opaque: RAMState pointer
2855  */
2856 static int ram_save_iterate(QEMUFile *f, void *opaque)
2857 {
2858     RAMState **temp = opaque;
2859     RAMState *rs = *temp;
2860     int ret = 0;
2861     int i;
2862     int64_t t0;
2863     int done = 0;
2864 
2865     if (blk_mig_bulk_active()) {
2866         /* Avoid transferring ram during bulk phase of block migration as
2867          * the bulk phase will usually take a long time and transferring
2868          * ram updates during that time is pointless. */
2869         goto out;
2870     }
2871 
2872     WITH_RCU_READ_LOCK_GUARD() {
2873         if (ram_list.version != rs->last_version) {
2874             ram_state_reset(rs);
2875         }
2876 
2877         /* Read version before ram_list.blocks */
2878         smp_rmb();
2879 
2880         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2881 
2882         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2883         i = 0;
2884         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2885                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2886             int pages;
2887 
2888             if (qemu_file_get_error(f)) {
2889                 break;
2890             }
2891 
2892             pages = ram_find_and_save_block(rs, false);
2893             /* no more pages to sent */
2894             if (pages == 0) {
2895                 done = 1;
2896                 break;
2897             }
2898 
2899             if (pages < 0) {
2900                 qemu_file_set_error(f, pages);
2901                 break;
2902             }
2903 
2904             rs->target_page_count += pages;
2905 
2906             /*
2907              * During postcopy, it is necessary to make sure one whole host
2908              * page is sent in one chunk.
2909              */
2910             if (migrate_postcopy_ram()) {
2911                 flush_compressed_data(rs);
2912             }
2913 
2914             /*
2915              * we want to check in the 1st loop, just in case it was the 1st
2916              * time and we had to sync the dirty bitmap.
2917              * qemu_clock_get_ns() is a bit expensive, so we only check each
2918              * some iterations
2919              */
2920             if ((i & 63) == 0) {
2921                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2922                               1000000;
2923                 if (t1 > MAX_WAIT) {
2924                     trace_ram_save_iterate_big_wait(t1, i);
2925                     break;
2926                 }
2927             }
2928             i++;
2929         }
2930     }
2931 
2932     /*
2933      * Must occur before EOS (or any QEMUFile operation)
2934      * because of RDMA protocol.
2935      */
2936     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2937 
2938 out:
2939     if (ret >= 0
2940         && migration_is_setup_or_active(migrate_get_current()->state)) {
2941         multifd_send_sync_main(rs->f);
2942         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2943         qemu_fflush(f);
2944         ram_counters.transferred += 8;
2945 
2946         ret = qemu_file_get_error(f);
2947     }
2948     if (ret < 0) {
2949         return ret;
2950     }
2951 
2952     return done;
2953 }
2954 
2955 /**
2956  * ram_save_complete: function called to send the remaining amount of ram
2957  *
2958  * Returns zero to indicate success or negative on error
2959  *
2960  * Called with iothread lock
2961  *
2962  * @f: QEMUFile where to send the data
2963  * @opaque: RAMState pointer
2964  */
2965 static int ram_save_complete(QEMUFile *f, void *opaque)
2966 {
2967     RAMState **temp = opaque;
2968     RAMState *rs = *temp;
2969     int ret = 0;
2970 
2971     WITH_RCU_READ_LOCK_GUARD() {
2972         if (!migration_in_postcopy()) {
2973             migration_bitmap_sync_precopy(rs);
2974         }
2975 
2976         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2977 
2978         /* try transferring iterative blocks of memory */
2979 
2980         /* flush all remaining blocks regardless of rate limiting */
2981         while (true) {
2982             int pages;
2983 
2984             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2985             /* no more blocks to sent */
2986             if (pages == 0) {
2987                 break;
2988             }
2989             if (pages < 0) {
2990                 ret = pages;
2991                 break;
2992             }
2993         }
2994 
2995         flush_compressed_data(rs);
2996         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2997     }
2998 
2999     if (ret >= 0) {
3000         multifd_send_sync_main(rs->f);
3001         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3002         qemu_fflush(f);
3003     }
3004 
3005     return ret;
3006 }
3007 
3008 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3009                              uint64_t *res_precopy_only,
3010                              uint64_t *res_compatible,
3011                              uint64_t *res_postcopy_only)
3012 {
3013     RAMState **temp = opaque;
3014     RAMState *rs = *temp;
3015     uint64_t remaining_size;
3016 
3017     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3018 
3019     if (!migration_in_postcopy() &&
3020         remaining_size < max_size) {
3021         qemu_mutex_lock_iothread();
3022         WITH_RCU_READ_LOCK_GUARD() {
3023             migration_bitmap_sync_precopy(rs);
3024         }
3025         qemu_mutex_unlock_iothread();
3026         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3027     }
3028 
3029     if (migrate_postcopy_ram()) {
3030         /* We can do postcopy, and all the data is postcopiable */
3031         *res_compatible += remaining_size;
3032     } else {
3033         *res_precopy_only += remaining_size;
3034     }
3035 }
3036 
3037 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3038 {
3039     unsigned int xh_len;
3040     int xh_flags;
3041     uint8_t *loaded_data;
3042 
3043     /* extract RLE header */
3044     xh_flags = qemu_get_byte(f);
3045     xh_len = qemu_get_be16(f);
3046 
3047     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3048         error_report("Failed to load XBZRLE page - wrong compression!");
3049         return -1;
3050     }
3051 
3052     if (xh_len > TARGET_PAGE_SIZE) {
3053         error_report("Failed to load XBZRLE page - len overflow!");
3054         return -1;
3055     }
3056     loaded_data = XBZRLE.decoded_buf;
3057     /* load data and decode */
3058     /* it can change loaded_data to point to an internal buffer */
3059     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3060 
3061     /* decode RLE */
3062     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3063                              TARGET_PAGE_SIZE) == -1) {
3064         error_report("Failed to load XBZRLE page - decode error!");
3065         return -1;
3066     }
3067 
3068     return 0;
3069 }
3070 
3071 /**
3072  * ram_block_from_stream: read a RAMBlock id from the migration stream
3073  *
3074  * Must be called from within a rcu critical section.
3075  *
3076  * Returns a pointer from within the RCU-protected ram_list.
3077  *
3078  * @f: QEMUFile where to read the data from
3079  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3080  */
3081 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3082 {
3083     static RAMBlock *block;
3084     char id[256];
3085     uint8_t len;
3086 
3087     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3088         if (!block) {
3089             error_report("Ack, bad migration stream!");
3090             return NULL;
3091         }
3092         return block;
3093     }
3094 
3095     len = qemu_get_byte(f);
3096     qemu_get_buffer(f, (uint8_t *)id, len);
3097     id[len] = 0;
3098 
3099     block = qemu_ram_block_by_name(id);
3100     if (!block) {
3101         error_report("Can't find block %s", id);
3102         return NULL;
3103     }
3104 
3105     if (ramblock_is_ignored(block)) {
3106         error_report("block %s should not be migrated !", id);
3107         return NULL;
3108     }
3109 
3110     return block;
3111 }
3112 
3113 static inline void *host_from_ram_block_offset(RAMBlock *block,
3114                                                ram_addr_t offset)
3115 {
3116     if (!offset_in_ramblock(block, offset)) {
3117         return NULL;
3118     }
3119 
3120     return block->host + offset;
3121 }
3122 
3123 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3124                              ram_addr_t offset, bool record_bitmap)
3125 {
3126     if (!offset_in_ramblock(block, offset)) {
3127         return NULL;
3128     }
3129     if (!block->colo_cache) {
3130         error_report("%s: colo_cache is NULL in block :%s",
3131                      __func__, block->idstr);
3132         return NULL;
3133     }
3134 
3135     /*
3136     * During colo checkpoint, we need bitmap of these migrated pages.
3137     * It help us to decide which pages in ram cache should be flushed
3138     * into VM's RAM later.
3139     */
3140     if (record_bitmap &&
3141         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3142         ram_state->migration_dirty_pages++;
3143     }
3144     return block->colo_cache + offset;
3145 }
3146 
3147 /**
3148  * ram_handle_compressed: handle the zero page case
3149  *
3150  * If a page (or a whole RDMA chunk) has been
3151  * determined to be zero, then zap it.
3152  *
3153  * @host: host address for the zero page
3154  * @ch: what the page is filled from.  We only support zero
3155  * @size: size of the zero page
3156  */
3157 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3158 {
3159     if (ch != 0 || !is_zero_range(host, size)) {
3160         memset(host, ch, size);
3161     }
3162 }
3163 
3164 /* return the size after decompression, or negative value on error */
3165 static int
3166 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3167                      const uint8_t *source, size_t source_len)
3168 {
3169     int err;
3170 
3171     err = inflateReset(stream);
3172     if (err != Z_OK) {
3173         return -1;
3174     }
3175 
3176     stream->avail_in = source_len;
3177     stream->next_in = (uint8_t *)source;
3178     stream->avail_out = dest_len;
3179     stream->next_out = dest;
3180 
3181     err = inflate(stream, Z_NO_FLUSH);
3182     if (err != Z_STREAM_END) {
3183         return -1;
3184     }
3185 
3186     return stream->total_out;
3187 }
3188 
3189 static void *do_data_decompress(void *opaque)
3190 {
3191     DecompressParam *param = opaque;
3192     unsigned long pagesize;
3193     uint8_t *des;
3194     int len, ret;
3195 
3196     qemu_mutex_lock(&param->mutex);
3197     while (!param->quit) {
3198         if (param->des) {
3199             des = param->des;
3200             len = param->len;
3201             param->des = 0;
3202             qemu_mutex_unlock(&param->mutex);
3203 
3204             pagesize = TARGET_PAGE_SIZE;
3205 
3206             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3207                                        param->compbuf, len);
3208             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3209                 error_report("decompress data failed");
3210                 qemu_file_set_error(decomp_file, ret);
3211             }
3212 
3213             qemu_mutex_lock(&decomp_done_lock);
3214             param->done = true;
3215             qemu_cond_signal(&decomp_done_cond);
3216             qemu_mutex_unlock(&decomp_done_lock);
3217 
3218             qemu_mutex_lock(&param->mutex);
3219         } else {
3220             qemu_cond_wait(&param->cond, &param->mutex);
3221         }
3222     }
3223     qemu_mutex_unlock(&param->mutex);
3224 
3225     return NULL;
3226 }
3227 
3228 static int wait_for_decompress_done(void)
3229 {
3230     int idx, thread_count;
3231 
3232     if (!migrate_use_compression()) {
3233         return 0;
3234     }
3235 
3236     thread_count = migrate_decompress_threads();
3237     qemu_mutex_lock(&decomp_done_lock);
3238     for (idx = 0; idx < thread_count; idx++) {
3239         while (!decomp_param[idx].done) {
3240             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3241         }
3242     }
3243     qemu_mutex_unlock(&decomp_done_lock);
3244     return qemu_file_get_error(decomp_file);
3245 }
3246 
3247 static void compress_threads_load_cleanup(void)
3248 {
3249     int i, thread_count;
3250 
3251     if (!migrate_use_compression()) {
3252         return;
3253     }
3254     thread_count = migrate_decompress_threads();
3255     for (i = 0; i < thread_count; i++) {
3256         /*
3257          * we use it as a indicator which shows if the thread is
3258          * properly init'd or not
3259          */
3260         if (!decomp_param[i].compbuf) {
3261             break;
3262         }
3263 
3264         qemu_mutex_lock(&decomp_param[i].mutex);
3265         decomp_param[i].quit = true;
3266         qemu_cond_signal(&decomp_param[i].cond);
3267         qemu_mutex_unlock(&decomp_param[i].mutex);
3268     }
3269     for (i = 0; i < thread_count; i++) {
3270         if (!decomp_param[i].compbuf) {
3271             break;
3272         }
3273 
3274         qemu_thread_join(decompress_threads + i);
3275         qemu_mutex_destroy(&decomp_param[i].mutex);
3276         qemu_cond_destroy(&decomp_param[i].cond);
3277         inflateEnd(&decomp_param[i].stream);
3278         g_free(decomp_param[i].compbuf);
3279         decomp_param[i].compbuf = NULL;
3280     }
3281     g_free(decompress_threads);
3282     g_free(decomp_param);
3283     decompress_threads = NULL;
3284     decomp_param = NULL;
3285     decomp_file = NULL;
3286 }
3287 
3288 static int compress_threads_load_setup(QEMUFile *f)
3289 {
3290     int i, thread_count;
3291 
3292     if (!migrate_use_compression()) {
3293         return 0;
3294     }
3295 
3296     thread_count = migrate_decompress_threads();
3297     decompress_threads = g_new0(QemuThread, thread_count);
3298     decomp_param = g_new0(DecompressParam, thread_count);
3299     qemu_mutex_init(&decomp_done_lock);
3300     qemu_cond_init(&decomp_done_cond);
3301     decomp_file = f;
3302     for (i = 0; i < thread_count; i++) {
3303         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3304             goto exit;
3305         }
3306 
3307         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3308         qemu_mutex_init(&decomp_param[i].mutex);
3309         qemu_cond_init(&decomp_param[i].cond);
3310         decomp_param[i].done = true;
3311         decomp_param[i].quit = false;
3312         qemu_thread_create(decompress_threads + i, "decompress",
3313                            do_data_decompress, decomp_param + i,
3314                            QEMU_THREAD_JOINABLE);
3315     }
3316     return 0;
3317 exit:
3318     compress_threads_load_cleanup();
3319     return -1;
3320 }
3321 
3322 static void decompress_data_with_multi_threads(QEMUFile *f,
3323                                                void *host, int len)
3324 {
3325     int idx, thread_count;
3326 
3327     thread_count = migrate_decompress_threads();
3328     QEMU_LOCK_GUARD(&decomp_done_lock);
3329     while (true) {
3330         for (idx = 0; idx < thread_count; idx++) {
3331             if (decomp_param[idx].done) {
3332                 decomp_param[idx].done = false;
3333                 qemu_mutex_lock(&decomp_param[idx].mutex);
3334                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3335                 decomp_param[idx].des = host;
3336                 decomp_param[idx].len = len;
3337                 qemu_cond_signal(&decomp_param[idx].cond);
3338                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3339                 break;
3340             }
3341         }
3342         if (idx < thread_count) {
3343             break;
3344         } else {
3345             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3346         }
3347     }
3348 }
3349 
3350  /*
3351   * we must set ram_bulk_stage to false, otherwise in
3352   * migation_bitmap_find_dirty the bitmap will be unused and
3353   * all the pages in ram cache wil be flushed to the ram of
3354   * secondary VM.
3355   */
3356 static void colo_init_ram_state(void)
3357 {
3358     ram_state_init(&ram_state);
3359     ram_state->ram_bulk_stage = false;
3360 }
3361 
3362 /*
3363  * colo cache: this is for secondary VM, we cache the whole
3364  * memory of the secondary VM, it is need to hold the global lock
3365  * to call this helper.
3366  */
3367 int colo_init_ram_cache(void)
3368 {
3369     RAMBlock *block;
3370 
3371     WITH_RCU_READ_LOCK_GUARD() {
3372         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3373             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3374                                                     NULL,
3375                                                     false);
3376             if (!block->colo_cache) {
3377                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3378                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3379                              block->used_length);
3380                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3381                     if (block->colo_cache) {
3382                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3383                         block->colo_cache = NULL;
3384                     }
3385                 }
3386                 return -errno;
3387             }
3388         }
3389     }
3390 
3391     /*
3392     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3393     * with to decide which page in cache should be flushed into SVM's RAM. Here
3394     * we use the same name 'ram_bitmap' as for migration.
3395     */
3396     if (ram_bytes_total()) {
3397         RAMBlock *block;
3398 
3399         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3400             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3401             block->bmap = bitmap_new(pages);
3402         }
3403     }
3404 
3405     colo_init_ram_state();
3406     return 0;
3407 }
3408 
3409 /* TODO: duplicated with ram_init_bitmaps */
3410 void colo_incoming_start_dirty_log(void)
3411 {
3412     RAMBlock *block = NULL;
3413     /* For memory_global_dirty_log_start below. */
3414     qemu_mutex_lock_iothread();
3415     qemu_mutex_lock_ramlist();
3416 
3417     memory_global_dirty_log_sync();
3418     WITH_RCU_READ_LOCK_GUARD() {
3419         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3420             ramblock_sync_dirty_bitmap(ram_state, block);
3421             /* Discard this dirty bitmap record */
3422             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3423         }
3424         memory_global_dirty_log_start();
3425     }
3426     ram_state->migration_dirty_pages = 0;
3427     qemu_mutex_unlock_ramlist();
3428     qemu_mutex_unlock_iothread();
3429 }
3430 
3431 /* It is need to hold the global lock to call this helper */
3432 void colo_release_ram_cache(void)
3433 {
3434     RAMBlock *block;
3435 
3436     memory_global_dirty_log_stop();
3437     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3438         g_free(block->bmap);
3439         block->bmap = NULL;
3440     }
3441 
3442     WITH_RCU_READ_LOCK_GUARD() {
3443         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3444             if (block->colo_cache) {
3445                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3446                 block->colo_cache = NULL;
3447             }
3448         }
3449     }
3450     ram_state_cleanup(&ram_state);
3451 }
3452 
3453 /**
3454  * ram_load_setup: Setup RAM for migration incoming side
3455  *
3456  * Returns zero to indicate success and negative for error
3457  *
3458  * @f: QEMUFile where to receive the data
3459  * @opaque: RAMState pointer
3460  */
3461 static int ram_load_setup(QEMUFile *f, void *opaque)
3462 {
3463     if (compress_threads_load_setup(f)) {
3464         return -1;
3465     }
3466 
3467     xbzrle_load_setup();
3468     ramblock_recv_map_init();
3469 
3470     return 0;
3471 }
3472 
3473 static int ram_load_cleanup(void *opaque)
3474 {
3475     RAMBlock *rb;
3476 
3477     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3478         qemu_ram_block_writeback(rb);
3479     }
3480 
3481     xbzrle_load_cleanup();
3482     compress_threads_load_cleanup();
3483 
3484     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3485         g_free(rb->receivedmap);
3486         rb->receivedmap = NULL;
3487     }
3488 
3489     return 0;
3490 }
3491 
3492 /**
3493  * ram_postcopy_incoming_init: allocate postcopy data structures
3494  *
3495  * Returns 0 for success and negative if there was one error
3496  *
3497  * @mis: current migration incoming state
3498  *
3499  * Allocate data structures etc needed by incoming migration with
3500  * postcopy-ram. postcopy-ram's similarly names
3501  * postcopy_ram_incoming_init does the work.
3502  */
3503 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3504 {
3505     return postcopy_ram_incoming_init(mis);
3506 }
3507 
3508 /**
3509  * ram_load_postcopy: load a page in postcopy case
3510  *
3511  * Returns 0 for success or -errno in case of error
3512  *
3513  * Called in postcopy mode by ram_load().
3514  * rcu_read_lock is taken prior to this being called.
3515  *
3516  * @f: QEMUFile where to send the data
3517  */
3518 static int ram_load_postcopy(QEMUFile *f)
3519 {
3520     int flags = 0, ret = 0;
3521     bool place_needed = false;
3522     bool matches_target_page_size = false;
3523     MigrationIncomingState *mis = migration_incoming_get_current();
3524     /* Temporary page that is later 'placed' */
3525     void *postcopy_host_page = mis->postcopy_tmp_page;
3526     void *this_host = NULL;
3527     bool all_zero = true;
3528     int target_pages = 0;
3529 
3530     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3531         ram_addr_t addr;
3532         void *host = NULL;
3533         void *page_buffer = NULL;
3534         void *place_source = NULL;
3535         RAMBlock *block = NULL;
3536         uint8_t ch;
3537         int len;
3538 
3539         addr = qemu_get_be64(f);
3540 
3541         /*
3542          * If qemu file error, we should stop here, and then "addr"
3543          * may be invalid
3544          */
3545         ret = qemu_file_get_error(f);
3546         if (ret) {
3547             break;
3548         }
3549 
3550         flags = addr & ~TARGET_PAGE_MASK;
3551         addr &= TARGET_PAGE_MASK;
3552 
3553         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3554         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3555                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3556             block = ram_block_from_stream(f, flags);
3557 
3558             host = host_from_ram_block_offset(block, addr);
3559             if (!host) {
3560                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3561                 ret = -EINVAL;
3562                 break;
3563             }
3564             target_pages++;
3565             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3566             /*
3567              * Postcopy requires that we place whole host pages atomically;
3568              * these may be huge pages for RAMBlocks that are backed by
3569              * hugetlbfs.
3570              * To make it atomic, the data is read into a temporary page
3571              * that's moved into place later.
3572              * The migration protocol uses,  possibly smaller, target-pages
3573              * however the source ensures it always sends all the components
3574              * of a host page in one chunk.
3575              */
3576             page_buffer = postcopy_host_page +
3577                           ((uintptr_t)host & (block->page_size - 1));
3578             if (target_pages == 1) {
3579                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3580                                                     block->page_size);
3581             } else {
3582                 /* not the 1st TP within the HP */
3583                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3584                     (uintptr_t)this_host) {
3585                     error_report("Non-same host page %p/%p",
3586                                   host, this_host);
3587                     ret = -EINVAL;
3588                     break;
3589                 }
3590             }
3591 
3592             /*
3593              * If it's the last part of a host page then we place the host
3594              * page
3595              */
3596             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3597                 place_needed = true;
3598             }
3599             place_source = postcopy_host_page;
3600         }
3601 
3602         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3603         case RAM_SAVE_FLAG_ZERO:
3604             ch = qemu_get_byte(f);
3605             /*
3606              * Can skip to set page_buffer when
3607              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3608              */
3609             if (ch || !matches_target_page_size) {
3610                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3611             }
3612             if (ch) {
3613                 all_zero = false;
3614             }
3615             break;
3616 
3617         case RAM_SAVE_FLAG_PAGE:
3618             all_zero = false;
3619             if (!matches_target_page_size) {
3620                 /* For huge pages, we always use temporary buffer */
3621                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3622             } else {
3623                 /*
3624                  * For small pages that matches target page size, we
3625                  * avoid the qemu_file copy.  Instead we directly use
3626                  * the buffer of QEMUFile to place the page.  Note: we
3627                  * cannot do any QEMUFile operation before using that
3628                  * buffer to make sure the buffer is valid when
3629                  * placing the page.
3630                  */
3631                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3632                                          TARGET_PAGE_SIZE);
3633             }
3634             break;
3635         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3636             all_zero = false;
3637             len = qemu_get_be32(f);
3638             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3639                 error_report("Invalid compressed data length: %d", len);
3640                 ret = -EINVAL;
3641                 break;
3642             }
3643             decompress_data_with_multi_threads(f, page_buffer, len);
3644             break;
3645 
3646         case RAM_SAVE_FLAG_EOS:
3647             /* normal exit */
3648             multifd_recv_sync_main();
3649             break;
3650         default:
3651             error_report("Unknown combination of migration flags: 0x%x"
3652                          " (postcopy mode)", flags);
3653             ret = -EINVAL;
3654             break;
3655         }
3656 
3657         /* Got the whole host page, wait for decompress before placing. */
3658         if (place_needed) {
3659             ret |= wait_for_decompress_done();
3660         }
3661 
3662         /* Detect for any possible file errors */
3663         if (!ret && qemu_file_get_error(f)) {
3664             ret = qemu_file_get_error(f);
3665         }
3666 
3667         if (!ret && place_needed) {
3668             /* This gets called at the last target page in the host page */
3669             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3670                                                        block->page_size);
3671 
3672             if (all_zero) {
3673                 ret = postcopy_place_page_zero(mis, place_dest,
3674                                                block);
3675             } else {
3676                 ret = postcopy_place_page(mis, place_dest,
3677                                           place_source, block);
3678             }
3679             place_needed = false;
3680             target_pages = 0;
3681             /* Assume we have a zero page until we detect something different */
3682             all_zero = true;
3683         }
3684     }
3685 
3686     return ret;
3687 }
3688 
3689 static bool postcopy_is_advised(void)
3690 {
3691     PostcopyState ps = postcopy_state_get();
3692     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3693 }
3694 
3695 static bool postcopy_is_running(void)
3696 {
3697     PostcopyState ps = postcopy_state_get();
3698     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3699 }
3700 
3701 /*
3702  * Flush content of RAM cache into SVM's memory.
3703  * Only flush the pages that be dirtied by PVM or SVM or both.
3704  */
3705 void colo_flush_ram_cache(void)
3706 {
3707     RAMBlock *block = NULL;
3708     void *dst_host;
3709     void *src_host;
3710     unsigned long offset = 0;
3711 
3712     memory_global_dirty_log_sync();
3713     WITH_RCU_READ_LOCK_GUARD() {
3714         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3715             ramblock_sync_dirty_bitmap(ram_state, block);
3716         }
3717     }
3718 
3719     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3720     WITH_RCU_READ_LOCK_GUARD() {
3721         block = QLIST_FIRST_RCU(&ram_list.blocks);
3722 
3723         while (block) {
3724             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3725 
3726             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3727                 >= block->used_length) {
3728                 offset = 0;
3729                 block = QLIST_NEXT_RCU(block, next);
3730             } else {
3731                 migration_bitmap_clear_dirty(ram_state, block, offset);
3732                 dst_host = block->host
3733                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3734                 src_host = block->colo_cache
3735                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3736                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3737             }
3738         }
3739     }
3740     trace_colo_flush_ram_cache_end();
3741 }
3742 
3743 /**
3744  * ram_load_precopy: load pages in precopy case
3745  *
3746  * Returns 0 for success or -errno in case of error
3747  *
3748  * Called in precopy mode by ram_load().
3749  * rcu_read_lock is taken prior to this being called.
3750  *
3751  * @f: QEMUFile where to send the data
3752  */
3753 static int ram_load_precopy(QEMUFile *f)
3754 {
3755     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3756     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3757     bool postcopy_advised = postcopy_is_advised();
3758     if (!migrate_use_compression()) {
3759         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3760     }
3761 
3762     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3763         ram_addr_t addr, total_ram_bytes;
3764         void *host = NULL, *host_bak = NULL;
3765         uint8_t ch;
3766 
3767         /*
3768          * Yield periodically to let main loop run, but an iteration of
3769          * the main loop is expensive, so do it each some iterations
3770          */
3771         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3772             aio_co_schedule(qemu_get_current_aio_context(),
3773                             qemu_coroutine_self());
3774             qemu_coroutine_yield();
3775         }
3776         i++;
3777 
3778         addr = qemu_get_be64(f);
3779         flags = addr & ~TARGET_PAGE_MASK;
3780         addr &= TARGET_PAGE_MASK;
3781 
3782         if (flags & invalid_flags) {
3783             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3784                 error_report("Received an unexpected compressed page");
3785             }
3786 
3787             ret = -EINVAL;
3788             break;
3789         }
3790 
3791         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3792                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3793             RAMBlock *block = ram_block_from_stream(f, flags);
3794 
3795             host = host_from_ram_block_offset(block, addr);
3796             /*
3797              * After going into COLO stage, we should not load the page
3798              * into SVM's memory directly, we put them into colo_cache firstly.
3799              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3800              * Previously, we copied all these memory in preparing stage of COLO
3801              * while we need to stop VM, which is a time-consuming process.
3802              * Here we optimize it by a trick, back-up every page while in
3803              * migration process while COLO is enabled, though it affects the
3804              * speed of the migration, but it obviously reduce the downtime of
3805              * back-up all SVM'S memory in COLO preparing stage.
3806              */
3807             if (migration_incoming_colo_enabled()) {
3808                 if (migration_incoming_in_colo_state()) {
3809                     /* In COLO stage, put all pages into cache temporarily */
3810                     host = colo_cache_from_block_offset(block, addr, true);
3811                 } else {
3812                    /*
3813                     * In migration stage but before COLO stage,
3814                     * Put all pages into both cache and SVM's memory.
3815                     */
3816                     host_bak = colo_cache_from_block_offset(block, addr, false);
3817                 }
3818             }
3819             if (!host) {
3820                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3821                 ret = -EINVAL;
3822                 break;
3823             }
3824             if (!migration_incoming_in_colo_state()) {
3825                 ramblock_recv_bitmap_set(block, host);
3826             }
3827 
3828             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3829         }
3830 
3831         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3832         case RAM_SAVE_FLAG_MEM_SIZE:
3833             /* Synchronize RAM block list */
3834             total_ram_bytes = addr;
3835             while (!ret && total_ram_bytes) {
3836                 RAMBlock *block;
3837                 char id[256];
3838                 ram_addr_t length;
3839 
3840                 len = qemu_get_byte(f);
3841                 qemu_get_buffer(f, (uint8_t *)id, len);
3842                 id[len] = 0;
3843                 length = qemu_get_be64(f);
3844 
3845                 block = qemu_ram_block_by_name(id);
3846                 if (block && !qemu_ram_is_migratable(block)) {
3847                     error_report("block %s should not be migrated !", id);
3848                     ret = -EINVAL;
3849                 } else if (block) {
3850                     if (length != block->used_length) {
3851                         Error *local_err = NULL;
3852 
3853                         ret = qemu_ram_resize(block, length,
3854                                               &local_err);
3855                         if (local_err) {
3856                             error_report_err(local_err);
3857                         }
3858                     }
3859                     /* For postcopy we need to check hugepage sizes match */
3860                     if (postcopy_advised && migrate_postcopy_ram() &&
3861                         block->page_size != qemu_host_page_size) {
3862                         uint64_t remote_page_size = qemu_get_be64(f);
3863                         if (remote_page_size != block->page_size) {
3864                             error_report("Mismatched RAM page size %s "
3865                                          "(local) %zd != %" PRId64,
3866                                          id, block->page_size,
3867                                          remote_page_size);
3868                             ret = -EINVAL;
3869                         }
3870                     }
3871                     if (migrate_ignore_shared()) {
3872                         hwaddr addr = qemu_get_be64(f);
3873                         if (ramblock_is_ignored(block) &&
3874                             block->mr->addr != addr) {
3875                             error_report("Mismatched GPAs for block %s "
3876                                          "%" PRId64 "!= %" PRId64,
3877                                          id, (uint64_t)addr,
3878                                          (uint64_t)block->mr->addr);
3879                             ret = -EINVAL;
3880                         }
3881                     }
3882                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3883                                           block->idstr);
3884                 } else {
3885                     error_report("Unknown ramblock \"%s\", cannot "
3886                                  "accept migration", id);
3887                     ret = -EINVAL;
3888                 }
3889 
3890                 total_ram_bytes -= length;
3891             }
3892             break;
3893 
3894         case RAM_SAVE_FLAG_ZERO:
3895             ch = qemu_get_byte(f);
3896             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3897             break;
3898 
3899         case RAM_SAVE_FLAG_PAGE:
3900             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3901             break;
3902 
3903         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3904             len = qemu_get_be32(f);
3905             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3906                 error_report("Invalid compressed data length: %d", len);
3907                 ret = -EINVAL;
3908                 break;
3909             }
3910             decompress_data_with_multi_threads(f, host, len);
3911             break;
3912 
3913         case RAM_SAVE_FLAG_XBZRLE:
3914             if (load_xbzrle(f, addr, host) < 0) {
3915                 error_report("Failed to decompress XBZRLE page at "
3916                              RAM_ADDR_FMT, addr);
3917                 ret = -EINVAL;
3918                 break;
3919             }
3920             break;
3921         case RAM_SAVE_FLAG_EOS:
3922             /* normal exit */
3923             multifd_recv_sync_main();
3924             break;
3925         default:
3926             if (flags & RAM_SAVE_FLAG_HOOK) {
3927                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3928             } else {
3929                 error_report("Unknown combination of migration flags: 0x%x",
3930                              flags);
3931                 ret = -EINVAL;
3932             }
3933         }
3934         if (!ret) {
3935             ret = qemu_file_get_error(f);
3936         }
3937         if (!ret && host_bak) {
3938             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3939         }
3940     }
3941 
3942     ret |= wait_for_decompress_done();
3943     return ret;
3944 }
3945 
3946 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3947 {
3948     int ret = 0;
3949     static uint64_t seq_iter;
3950     /*
3951      * If system is running in postcopy mode, page inserts to host memory must
3952      * be atomic
3953      */
3954     bool postcopy_running = postcopy_is_running();
3955 
3956     seq_iter++;
3957 
3958     if (version_id != 4) {
3959         return -EINVAL;
3960     }
3961 
3962     /*
3963      * This RCU critical section can be very long running.
3964      * When RCU reclaims in the code start to become numerous,
3965      * it will be necessary to reduce the granularity of this
3966      * critical section.
3967      */
3968     WITH_RCU_READ_LOCK_GUARD() {
3969         if (postcopy_running) {
3970             ret = ram_load_postcopy(f);
3971         } else {
3972             ret = ram_load_precopy(f);
3973         }
3974     }
3975     trace_ram_load_complete(ret, seq_iter);
3976 
3977     return ret;
3978 }
3979 
3980 static bool ram_has_postcopy(void *opaque)
3981 {
3982     RAMBlock *rb;
3983     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3984         if (ramblock_is_pmem(rb)) {
3985             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3986                          "is not supported now!", rb->idstr, rb->host);
3987             return false;
3988         }
3989     }
3990 
3991     return migrate_postcopy_ram();
3992 }
3993 
3994 /* Sync all the dirty bitmap with destination VM.  */
3995 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3996 {
3997     RAMBlock *block;
3998     QEMUFile *file = s->to_dst_file;
3999     int ramblock_count = 0;
4000 
4001     trace_ram_dirty_bitmap_sync_start();
4002 
4003     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4004         qemu_savevm_send_recv_bitmap(file, block->idstr);
4005         trace_ram_dirty_bitmap_request(block->idstr);
4006         ramblock_count++;
4007     }
4008 
4009     trace_ram_dirty_bitmap_sync_wait();
4010 
4011     /* Wait until all the ramblocks' dirty bitmap synced */
4012     while (ramblock_count--) {
4013         qemu_sem_wait(&s->rp_state.rp_sem);
4014     }
4015 
4016     trace_ram_dirty_bitmap_sync_complete();
4017 
4018     return 0;
4019 }
4020 
4021 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4022 {
4023     qemu_sem_post(&s->rp_state.rp_sem);
4024 }
4025 
4026 /*
4027  * Read the received bitmap, revert it as the initial dirty bitmap.
4028  * This is only used when the postcopy migration is paused but wants
4029  * to resume from a middle point.
4030  */
4031 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4032 {
4033     int ret = -EINVAL;
4034     QEMUFile *file = s->rp_state.from_dst_file;
4035     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4036     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4037     uint64_t size, end_mark;
4038 
4039     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4040 
4041     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4042         error_report("%s: incorrect state %s", __func__,
4043                      MigrationStatus_str(s->state));
4044         return -EINVAL;
4045     }
4046 
4047     /*
4048      * Note: see comments in ramblock_recv_bitmap_send() on why we
4049      * need the endianness conversion, and the paddings.
4050      */
4051     local_size = ROUND_UP(local_size, 8);
4052 
4053     /* Add paddings */
4054     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4055 
4056     size = qemu_get_be64(file);
4057 
4058     /* The size of the bitmap should match with our ramblock */
4059     if (size != local_size) {
4060         error_report("%s: ramblock '%s' bitmap size mismatch "
4061                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4062                      block->idstr, size, local_size);
4063         ret = -EINVAL;
4064         goto out;
4065     }
4066 
4067     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4068     end_mark = qemu_get_be64(file);
4069 
4070     ret = qemu_file_get_error(file);
4071     if (ret || size != local_size) {
4072         error_report("%s: read bitmap failed for ramblock '%s': %d"
4073                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4074                      __func__, block->idstr, ret, local_size, size);
4075         ret = -EIO;
4076         goto out;
4077     }
4078 
4079     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4080         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4081                      __func__, block->idstr, end_mark);
4082         ret = -EINVAL;
4083         goto out;
4084     }
4085 
4086     /*
4087      * Endianness conversion. We are during postcopy (though paused).
4088      * The dirty bitmap won't change. We can directly modify it.
4089      */
4090     bitmap_from_le(block->bmap, le_bitmap, nbits);
4091 
4092     /*
4093      * What we received is "received bitmap". Revert it as the initial
4094      * dirty bitmap for this ramblock.
4095      */
4096     bitmap_complement(block->bmap, block->bmap, nbits);
4097 
4098     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4099 
4100     /*
4101      * We succeeded to sync bitmap for current ramblock. If this is
4102      * the last one to sync, we need to notify the main send thread.
4103      */
4104     ram_dirty_bitmap_reload_notify(s);
4105 
4106     ret = 0;
4107 out:
4108     g_free(le_bitmap);
4109     return ret;
4110 }
4111 
4112 static int ram_resume_prepare(MigrationState *s, void *opaque)
4113 {
4114     RAMState *rs = *(RAMState **)opaque;
4115     int ret;
4116 
4117     ret = ram_dirty_bitmap_sync_all(s, rs);
4118     if (ret) {
4119         return ret;
4120     }
4121 
4122     ram_state_resume_prepare(rs, s->to_dst_file);
4123 
4124     return 0;
4125 }
4126 
4127 static SaveVMHandlers savevm_ram_handlers = {
4128     .save_setup = ram_save_setup,
4129     .save_live_iterate = ram_save_iterate,
4130     .save_live_complete_postcopy = ram_save_complete,
4131     .save_live_complete_precopy = ram_save_complete,
4132     .has_postcopy = ram_has_postcopy,
4133     .save_live_pending = ram_save_pending,
4134     .load_state = ram_load,
4135     .save_cleanup = ram_save_cleanup,
4136     .load_setup = ram_load_setup,
4137     .load_cleanup = ram_load_cleanup,
4138     .resume_prepare = ram_resume_prepare,
4139 };
4140 
4141 void ram_mig_init(void)
4142 {
4143     qemu_mutex_init(&XBZRLE.lock);
4144     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4145 }
4146