xref: /openbmc/qemu/migration/ram.c (revision 56e2cd24)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47 
48 static int dirty_rate_high_cnt;
49 
50 static uint64_t bitmap_sync_count;
51 
52 /***********************************************************/
53 /* ram save/restore */
54 
55 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
56 #define RAM_SAVE_FLAG_COMPRESS 0x02
57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
58 #define RAM_SAVE_FLAG_PAGE     0x08
59 #define RAM_SAVE_FLAG_EOS      0x10
60 #define RAM_SAVE_FLAG_CONTINUE 0x20
61 #define RAM_SAVE_FLAG_XBZRLE   0x40
62 /* 0x80 is reserved in migration.h start with 0x100 next */
63 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
64 
65 static uint8_t *ZERO_TARGET_PAGE;
66 
67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
68 {
69     return buffer_is_zero(p, size);
70 }
71 
72 /* struct contains XBZRLE cache and a static page
73    used by the compression */
74 static struct {
75     /* buffer used for XBZRLE encoding */
76     uint8_t *encoded_buf;
77     /* buffer for storing page content */
78     uint8_t *current_buf;
79     /* Cache for XBZRLE, Protected by lock. */
80     PageCache *cache;
81     QemuMutex lock;
82 } XBZRLE;
83 
84 /* buffer used for XBZRLE decoding */
85 static uint8_t *xbzrle_decoded_buf;
86 
87 static void XBZRLE_cache_lock(void)
88 {
89     if (migrate_use_xbzrle())
90         qemu_mutex_lock(&XBZRLE.lock);
91 }
92 
93 static void XBZRLE_cache_unlock(void)
94 {
95     if (migrate_use_xbzrle())
96         qemu_mutex_unlock(&XBZRLE.lock);
97 }
98 
99 /*
100  * called from qmp_migrate_set_cache_size in main thread, possibly while
101  * a migration is in progress.
102  * A running migration may be using the cache and might finish during this
103  * call, hence changes to the cache are protected by the XBZRLE.lock mutex.
104  */
105 int64_t xbzrle_cache_resize(int64_t new_size)
106 {
107     PageCache *new_cache;
108     int64_t ret;
109 
110     if (new_size < TARGET_PAGE_SIZE) {
111         return -1;
112     }
113 
114     XBZRLE_cache_lock();
115 
116     if (XBZRLE.cache != NULL) {
117         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
118             goto out_new_size;
119         }
120         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
121                                         TARGET_PAGE_SIZE);
122         if (!new_cache) {
123             error_report("Error creating cache");
124             ret = -1;
125             goto out;
126         }
127 
128         cache_fini(XBZRLE.cache);
129         XBZRLE.cache = new_cache;
130     }
131 
132 out_new_size:
133     ret = pow2floor(new_size);
134 out:
135     XBZRLE_cache_unlock();
136     return ret;
137 }
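
/*
 * Editorial usage sketch (illustrative, not part of the build): the cache
 * size is rounded down to a power of two, so a caller should read back the
 * value actually applied rather than assume its request was honoured.  The
 * 100 MiB request below is an arbitrary example value.
 *
 *     int64_t requested = 100 * 1024 * 1024;
 *     int64_t applied = xbzrle_cache_resize(requested);
 *     if (applied < 0) {
 *         error_report("XBZRLE cache resize failed");
 *     } else {
 *         // applied == pow2floor(requested), here 64 MiB
 *     }
 */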
138 
139 /* accounting for migration statistics */
140 typedef struct AccountingInfo {
141     uint64_t dup_pages;
142     uint64_t skipped_pages;
143     uint64_t norm_pages;
144     uint64_t iterations;
145     uint64_t xbzrle_bytes;
146     uint64_t xbzrle_pages;
147     uint64_t xbzrle_cache_miss;
148     double xbzrle_cache_miss_rate;
149     uint64_t xbzrle_overflows;
150 } AccountingInfo;
151 
152 static AccountingInfo acct_info;
153 
154 static void acct_clear(void)
155 {
156     memset(&acct_info, 0, sizeof(acct_info));
157 }
158 
159 uint64_t dup_mig_bytes_transferred(void)
160 {
161     return acct_info.dup_pages * TARGET_PAGE_SIZE;
162 }
163 
164 uint64_t dup_mig_pages_transferred(void)
165 {
166     return acct_info.dup_pages;
167 }
168 
169 uint64_t skipped_mig_bytes_transferred(void)
170 {
171     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
172 }
173 
174 uint64_t skipped_mig_pages_transferred(void)
175 {
176     return acct_info.skipped_pages;
177 }
178 
179 uint64_t norm_mig_bytes_transferred(void)
180 {
181     return acct_info.norm_pages * TARGET_PAGE_SIZE;
182 }
183 
184 uint64_t norm_mig_pages_transferred(void)
185 {
186     return acct_info.norm_pages;
187 }
188 
189 uint64_t xbzrle_mig_bytes_transferred(void)
190 {
191     return acct_info.xbzrle_bytes;
192 }
193 
194 uint64_t xbzrle_mig_pages_transferred(void)
195 {
196     return acct_info.xbzrle_pages;
197 }
198 
199 uint64_t xbzrle_mig_pages_cache_miss(void)
200 {
201     return acct_info.xbzrle_cache_miss;
202 }
203 
204 double xbzrle_mig_cache_miss_rate(void)
205 {
206     return acct_info.xbzrle_cache_miss_rate;
207 }
208 
209 uint64_t xbzrle_mig_pages_overflow(void)
210 {
211     return acct_info.xbzrle_overflows;
212 }
213 
214 /* This is the last block that we have visited searching for dirty pages
215  */
216 static RAMBlock *last_seen_block;
217 /* This is the last block from where we have sent data */
218 static RAMBlock *last_sent_block;
219 static ram_addr_t last_offset;
220 static QemuMutex migration_bitmap_mutex;
221 static uint64_t migration_dirty_pages;
222 static uint32_t last_version;
223 static bool ram_bulk_stage;
224 
225 /* used by the search for pages to send */
226 struct PageSearchStatus {
227     /* Current block being searched */
228     RAMBlock    *block;
229     /* Current offset to search from */
230     ram_addr_t   offset;
231     /* Set once we wrap around */
232     bool         complete_round;
233 };
234 typedef struct PageSearchStatus PageSearchStatus;
235 
236 static struct BitmapRcu {
237     struct rcu_head rcu;
238     /* Main migration bitmap */
239     unsigned long *bmap;
240     /* bitmap of pages that haven't been sent even once;
241      * only maintained and used in postcopy at the moment,
242      * where it's used to send the dirtymap at the start
243      * of the postcopy phase
244      */
245     unsigned long *unsentmap;
246 } *migration_bitmap_rcu;
247 
248 struct CompressParam {
249     bool done;
250     bool quit;
251     QEMUFile *file;
252     QemuMutex mutex;
253     QemuCond cond;
254     RAMBlock *block;
255     ram_addr_t offset;
256 };
257 typedef struct CompressParam CompressParam;
258 
259 struct DecompressParam {
260     bool done;
261     bool quit;
262     QemuMutex mutex;
263     QemuCond cond;
264     void *des;
265     uint8_t *compbuf;
266     int len;
267 };
268 typedef struct DecompressParam DecompressParam;
269 
270 static CompressParam *comp_param;
271 static QemuThread *compress_threads;
272 /* comp_done_cond is used to wake up the migration thread when
273  * one of the compression threads has finished the compression.
274  * comp_done_lock is used together with comp_done_cond.
275  */
276 static QemuMutex comp_done_lock;
277 static QemuCond comp_done_cond;
278 /* The empty QEMUFileOps will be used by file in CompressParam */
279 static const QEMUFileOps empty_ops = { };
280 
281 static bool compression_switch;
282 static DecompressParam *decomp_param;
283 static QemuThread *decompress_threads;
284 static QemuMutex decomp_done_lock;
285 static QemuCond decomp_done_cond;
286 
287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
288                                 ram_addr_t offset);
289 
290 static void *do_data_compress(void *opaque)
291 {
292     CompressParam *param = opaque;
293     RAMBlock *block;
294     ram_addr_t offset;
295 
296     qemu_mutex_lock(&param->mutex);
297     while (!param->quit) {
298         if (param->block) {
299             block = param->block;
300             offset = param->offset;
301             param->block = NULL;
302             qemu_mutex_unlock(&param->mutex);
303 
304             do_compress_ram_page(param->file, block, offset);
305 
306             qemu_mutex_lock(&comp_done_lock);
307             param->done = true;
308             qemu_cond_signal(&comp_done_cond);
309             qemu_mutex_unlock(&comp_done_lock);
310 
311             qemu_mutex_lock(&param->mutex);
312         } else {
313             qemu_cond_wait(&param->cond, &param->mutex);
314         }
315     }
316     qemu_mutex_unlock(&param->mutex);
317 
318     return NULL;
319 }
320 
321 static inline void terminate_compression_threads(void)
322 {
323     int idx, thread_count;
324 
325     thread_count = migrate_compress_threads();
326     for (idx = 0; idx < thread_count; idx++) {
327         qemu_mutex_lock(&comp_param[idx].mutex);
328         comp_param[idx].quit = true;
329         qemu_cond_signal(&comp_param[idx].cond);
330         qemu_mutex_unlock(&comp_param[idx].mutex);
331     }
332 }
333 
334 void migrate_compress_threads_join(void)
335 {
336     int i, thread_count;
337 
338     if (!migrate_use_compression()) {
339         return;
340     }
341     terminate_compression_threads();
342     thread_count = migrate_compress_threads();
343     for (i = 0; i < thread_count; i++) {
344         qemu_thread_join(compress_threads + i);
345         qemu_fclose(comp_param[i].file);
346         qemu_mutex_destroy(&comp_param[i].mutex);
347         qemu_cond_destroy(&comp_param[i].cond);
348     }
349     qemu_mutex_destroy(&comp_done_lock);
350     qemu_cond_destroy(&comp_done_cond);
351     g_free(compress_threads);
352     g_free(comp_param);
353     compress_threads = NULL;
354     comp_param = NULL;
355 }
356 
357 void migrate_compress_threads_create(void)
358 {
359     int i, thread_count;
360 
361     if (!migrate_use_compression()) {
362         return;
363     }
364     compression_switch = true;
365     thread_count = migrate_compress_threads();
366     compress_threads = g_new0(QemuThread, thread_count);
367     comp_param = g_new0(CompressParam, thread_count);
368     qemu_cond_init(&comp_done_cond);
369     qemu_mutex_init(&comp_done_lock);
370     for (i = 0; i < thread_count; i++) {
371         /* comp_param[i].file is just used as a dummy buffer to save data,
372          * set its ops to empty.
373          */
374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375         comp_param[i].done = true;
376         comp_param[i].quit = false;
377         qemu_mutex_init(&comp_param[i].mutex);
378         qemu_cond_init(&comp_param[i].cond);
379         qemu_thread_create(compress_threads + i, "compress",
380                            do_data_compress, comp_param + i,
381                            QEMU_THREAD_JOINABLE);
382     }
383 }
384 
385 /**
386  * save_page_header: Write page header to wire
387  *
388  * If this is the first page sent for this block, it also writes the block identification
389  *
390  * Returns: Number of bytes written
391  *
392  * @f: QEMUFile where to send the data
393  * @block: block that contains the page we want to send
394  * @offset: offset inside the block for the page
395  *          in the lower bits, it contains flags
396  */
397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
398 {
399     size_t size, len;
400 
401     qemu_put_be64(f, offset);
402     size = 8;
403 
404     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
405         len = strlen(block->idstr);
406         qemu_put_byte(f, len);
407         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
408         size += 1 + len;
409     }
410     return size;
411 }
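
/*
 * Wire-format sketch (illustrative values): for a page of a block named
 * "pc.ram" sent without RAM_SAVE_FLAG_CONTINUE, the header written above
 * is laid out as
 *
 *     8 bytes   be64   offset | RAM_SAVE_FLAG_* bits
 *     1 byte           6                 (strlen("pc.ram"))
 *     6 bytes          "pc.ram"          (idstr, not NUL terminated)
 *
 * so save_page_header() returns 8 + 1 + 6 = 15 here, and just 8 once the
 * CONTINUE flag marks further pages of the same block.
 */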
412 
413 /* Reduce the amount of guest CPU execution to hopefully slow down memory writes.
414  * If guest dirty memory rate is reduced below the rate at which we can
415  * transfer pages to the destination then we should be able to complete
416  * migration. Some workloads dirty memory way too fast and will not effectively
417  * converge, even with auto-converge.
418  */
419 static void mig_throttle_guest_down(void)
420 {
421     MigrationState *s = migrate_get_current();
422     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
423     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
424 
425     /* We have not started throttling yet. Let's start it. */
426     if (!cpu_throttle_active()) {
427         cpu_throttle_set(pct_initial);
428     } else {
429         /* Throttling already on, just increase the rate */
430         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
431     }
432 }
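
/*
 * Worked example (values are purely illustrative, not read from this file):
 * with cpu_throttle_initial = 20 and cpu_throttle_increment = 10, successive
 * calls move the throttle to 20%, 30%, 40%, ... each time the dirty rate
 * stays above what the link can absorb, until migration converges.
 */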
433 
434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
435  * The important thing is that a stale (not-yet-0'd) page be replaced
436  * by the new data.
437  * As a bonus, if the page wasn't in the cache it gets added so that
438  * when a small write is made into the 0'd page it is sent via XBZRLE
439  */
440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
441 {
442     if (ram_bulk_stage || !migrate_use_xbzrle()) {
443         return;
444     }
445 
446     /* We don't care if this fails to allocate a new cache page
447      * as long as it updates an old one */
448     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
449                  bitmap_sync_count);
450 }
451 
452 #define ENCODING_FLAG_XBZRLE 0x1
453 
454 /**
455  * save_xbzrle_page: compress and send current page
456  *
457  * Returns: 1 means that we wrote the page
458  *          0 means that page is identical to the one already sent
459  *          -1 means that xbzrle would be longer than normal
460  *
461  * @f: QEMUFile where to send the data
462  * @current_data: pointer to the address of the page contents
463  * @current_addr: ram address of the page
464  * @block: block that contains the page we want to send
465  * @offset: offset inside the block for the page
466  * @last_stage: if we are at the completion stage
467  * @bytes_transferred: increase it with the number of transferred bytes
468  */
469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
470                             ram_addr_t current_addr, RAMBlock *block,
471                             ram_addr_t offset, bool last_stage,
472                             uint64_t *bytes_transferred)
473 {
474     int encoded_len = 0, bytes_xbzrle;
475     uint8_t *prev_cached_page;
476 
477     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
478         acct_info.xbzrle_cache_miss++;
479         if (!last_stage) {
480             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
481                              bitmap_sync_count) == -1) {
482                 return -1;
483             } else {
484                 /* update *current_data when the page has been
485                    inserted into cache */
486                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
487             }
488         }
489         return -1;
490     }
491 
492     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
493 
494     /* save current buffer into memory */
495     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
496 
497     /* XBZRLE encoding (if there is no overflow) */
498     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
499                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
500                                        TARGET_PAGE_SIZE);
501     if (encoded_len == 0) {
502         trace_save_xbzrle_page_skipping();
503         return 0;
504     } else if (encoded_len == -1) {
505         trace_save_xbzrle_page_overflow();
506         acct_info.xbzrle_overflows++;
507         /* update data in the cache */
508         if (!last_stage) {
509             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
510             *current_data = prev_cached_page;
511         }
512         return -1;
513     }
514 
515     /* Update the data in the cache so it matches the page content we just sent */
516     if (!last_stage) {
517         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
518     }
519 
520     /* Send XBZRLE based compressed page */
521     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
522     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
523     qemu_put_be16(f, encoded_len);
524     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
525     bytes_xbzrle += encoded_len + 1 + 2;
526     acct_info.xbzrle_pages++;
527     acct_info.xbzrle_bytes += bytes_xbzrle;
528     *bytes_transferred += bytes_xbzrle;
529 
530     return 1;
531 }
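
/*
 * Stream-size sketch (illustrative): if the encoded delta for a target
 * page comes out at, say, 93 bytes, then the data added above is
 *
 *     bytes_xbzrle = save_page_header(...)  // 8, or 8 + 1 + strlen(idstr)
 *                  + 1                      // ENCODING_FLAG_XBZRLE
 *                  + 2                      // be16 encoded_len
 *                  + 93                     // the encoded buffer
 *
 * which is exactly what acct_info.xbzrle_bytes and *bytes_transferred
 * grow by for this page.
 */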
532 
533 /* Called with rcu_read_lock() to protect migration_bitmap
534  * rb: The RAMBlock  to search for dirty pages in
535  * start: Start address (typically so we can continue from previous page)
536  * ram_addr_abs: Pointer into which to store the address of the dirty page
537  *               within the global ram_addr space
538  *
539  * Returns: byte offset within memory region of the start of a dirty page
540  */
541 static inline
542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
543                                        ram_addr_t start,
544                                        ram_addr_t *ram_addr_abs)
545 {
546     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
547     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
548     uint64_t rb_size = rb->used_length;
549     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
550     unsigned long *bitmap;
551 
552     unsigned long next;
553 
554     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
555     if (ram_bulk_stage && nr > base) {
556         next = nr + 1;
557     } else {
558         next = find_next_bit(bitmap, size, nr);
559     }
560 
561     *ram_addr_abs = next << TARGET_PAGE_BITS;
562     return (next - base) << TARGET_PAGE_BITS;
563 }
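
/*
 * Index arithmetic sketch (illustrative, assuming 4 KiB target pages):
 * for a block at ram offset 0x40000000 and start == 0x3000,
 *
 *     base = 0x40000000 >> 12 == 0x40000   // first bit of this block
 *     nr   = base + (0x3000 >> 12)         // bit to start scanning from
 *     size = base + (used_length >> 12)    // one past the block's last bit
 *
 * The return value is an offset within the block, while *ram_addr_abs is
 * the same page expressed in the global ram_addr_t space.
 */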
564 
565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
566 {
567     bool ret;
568     int nr = addr >> TARGET_PAGE_BITS;
569     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
570 
571     ret = test_and_clear_bit(nr, bitmap);
572 
573     if (ret) {
574         migration_dirty_pages--;
575     }
576     return ret;
577 }
578 
579 static int64_t num_dirty_pages_period;
580 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
581 {
582     unsigned long *bitmap;
583     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
584     migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
585                              start, length, &num_dirty_pages_period);
586 }
587 
588 /* Fix me: there are too many global variables used in migration process. */
589 static int64_t start_time;
590 static int64_t bytes_xfer_prev;
591 static uint64_t xbzrle_cache_miss_prev;
592 static uint64_t iterations_prev;
593 
594 static void migration_bitmap_sync_init(void)
595 {
596     start_time = 0;
597     bytes_xfer_prev = 0;
598     num_dirty_pages_period = 0;
599     xbzrle_cache_miss_prev = 0;
600     iterations_prev = 0;
601 }
602 
603 /* Returns a summary bitmap of the page sizes of all RAMBlocks;
604  * for VMs with just normal pages this is equivalent to the
605  * host page size.  If there are any huge pages then it's the OR
606  * of all the different page sizes.
607  */
608 uint64_t ram_pagesize_summary(void)
609 {
610     RAMBlock *block;
611     uint64_t summary = 0;
612 
613     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
614         summary |= block->page_size;
615     }
616 
617     return summary;
618 }
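
/*
 * Example (illustrative): a guest whose main RAM sits on 2 MiB hugepages
 * with another block on 4 KiB pages yields summary == 0x200000 | 0x1000,
 * letting the peer check that it sees the same mix of page sizes.
 */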
619 
620 static void migration_bitmap_sync(void)
621 {
622     RAMBlock *block;
623     MigrationState *s = migrate_get_current();
624     int64_t end_time;
625     int64_t bytes_xfer_now;
626 
627     bitmap_sync_count++;
628 
629     if (!bytes_xfer_prev) {
630         bytes_xfer_prev = ram_bytes_transferred();
631     }
632 
633     if (!start_time) {
634         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
635     }
636 
637     trace_migration_bitmap_sync_start();
638     memory_global_dirty_log_sync();
639 
640     qemu_mutex_lock(&migration_bitmap_mutex);
641     rcu_read_lock();
642     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
643         migration_bitmap_sync_range(block->offset, block->used_length);
644     }
645     rcu_read_unlock();
646     qemu_mutex_unlock(&migration_bitmap_mutex);
647 
648     trace_migration_bitmap_sync_end(num_dirty_pages_period);
649 
650     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
651 
652     /* more than 1 second = 1000 milliseconds */
653     if (end_time > start_time + 1000) {
654         if (migrate_auto_converge()) {
655             /* The following detection logic can be refined later. For now:
656                Check to see if the bytes dirtied exceed half of the approx.
657                amount of bytes that just got transferred since the last time we
658                were in this routine. If that keeps happening, start or increase
659                throttling */
660             bytes_xfer_now = ram_bytes_transferred();
661 
662             if (s->dirty_pages_rate &&
663                (num_dirty_pages_period * TARGET_PAGE_SIZE >
664                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
665                (dirty_rate_high_cnt++ >= 2)) {
666                     trace_migration_throttle();
667                     dirty_rate_high_cnt = 0;
668                     mig_throttle_guest_down();
669              }
670              bytes_xfer_prev = bytes_xfer_now;
671         }
672 
673         if (migrate_use_xbzrle()) {
674             if (iterations_prev != acct_info.iterations) {
675                 acct_info.xbzrle_cache_miss_rate =
676                    (double)(acct_info.xbzrle_cache_miss -
677                             xbzrle_cache_miss_prev) /
678                    (acct_info.iterations - iterations_prev);
679             }
680             iterations_prev = acct_info.iterations;
681             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
682         }
683         s->dirty_pages_rate = num_dirty_pages_period * 1000
684             / (end_time - start_time);
685         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
686         start_time = end_time;
687         num_dirty_pages_period = 0;
688     }
689     s->dirty_sync_count = bitmap_sync_count;
690     if (migrate_use_events()) {
691         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
692     }
693 }
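
/*
 * Rate-calculation sketch (illustrative numbers): if 250000 pages were
 * dirtied during a 2000 ms sync window, the code above computes
 *
 *     dirty_pages_rate = 250000 * 1000 / 2000 == 125000 pages/s
 *     dirty_bytes_rate = dirty_pages_rate * TARGET_PAGE_SIZE
 *
 * and, with auto-converge enabled, throttling kicks in once the bytes
 * dirtied per window repeatedly exceed half of the bytes transferred in
 * that same window.
 */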
694 
695 /**
696  * save_zero_page: Send the zero page to the stream
697  *
698  * Returns: Number of pages written.
699  *
700  * @f: QEMUFile where to send the data
701  * @block: block that contains the page we want to send
702  * @offset: offset inside the block for the page
703  * @p: pointer to the page
704  * @bytes_transferred: increase it with the number of transferred bytes
705  */
706 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
707                           uint8_t *p, uint64_t *bytes_transferred)
708 {
709     int pages = -1;
710 
711     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
712         acct_info.dup_pages++;
713         *bytes_transferred += save_page_header(f, block,
714                                                offset | RAM_SAVE_FLAG_COMPRESS);
715         qemu_put_byte(f, 0);
716         *bytes_transferred += 1;
717         pages = 1;
718     }
719 
720     return pages;
721 }
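
/*
 * Wire-format sketch (illustrative): a zero page costs only its header
 * plus a single byte on the stream,
 *
 *     save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS);
 *     qemu_put_byte(f, 0);    // fill byte the destination expands from
 *
 * i.e. typically around 9 bytes instead of TARGET_PAGE_SIZE, which is why
 * dup_pages are accounted separately from norm_pages.
 */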
722 
723 static void ram_release_pages(MigrationState *ms, const char *block_name,
724                               uint64_t offset, int pages)
725 {
726     if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
727         return;
728     }
729 
730     ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS);
731 }
732 
733 /**
734  * ram_save_page: Send the given page to the stream
735  *
736  * Returns: Number of pages written.
737  *          < 0 - error
738  *          >=0 - Number of pages written - this might legally be 0
739  *                if xbzrle noticed the page was the same.
740  *
741  * @ms: The current migration state.
742  * @f: QEMUFile where to send the data
743  * @block: block that contains the page we want to send
744  * @offset: offset inside the block for the page
745  * @last_stage: if we are at the completion stage
746  * @bytes_transferred: increase it with the number of transferred bytes
747  */
748 static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
749                          bool last_stage, uint64_t *bytes_transferred)
750 {
751     int pages = -1;
752     uint64_t bytes_xmit;
753     ram_addr_t current_addr;
754     uint8_t *p;
755     int ret;
756     bool send_async = true;
757     RAMBlock *block = pss->block;
758     ram_addr_t offset = pss->offset;
759 
760     p = block->host + offset;
761 
762     /* When in doubt, send the page as a normal page */
763     bytes_xmit = 0;
764     ret = ram_control_save_page(f, block->offset,
765                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
766     if (bytes_xmit) {
767         *bytes_transferred += bytes_xmit;
768         pages = 1;
769     }
770 
771     XBZRLE_cache_lock();
772 
773     current_addr = block->offset + offset;
774 
775     if (block == last_sent_block) {
776         offset |= RAM_SAVE_FLAG_CONTINUE;
777     }
778     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
779         if (ret != RAM_SAVE_CONTROL_DELAYED) {
780             if (bytes_xmit > 0) {
781                 acct_info.norm_pages++;
782             } else if (bytes_xmit == 0) {
783                 acct_info.dup_pages++;
784             }
785         }
786     } else {
787         pages = save_zero_page(f, block, offset, p, bytes_transferred);
788         if (pages > 0) {
789             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
790              * page would be stale
791              */
792             xbzrle_cache_zero_page(current_addr);
793             ram_release_pages(ms, block->idstr, pss->offset, pages);
794         } else if (!ram_bulk_stage &&
795                    !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
796             pages = save_xbzrle_page(f, &p, current_addr, block,
797                                      offset, last_stage, bytes_transferred);
798             if (!last_stage) {
799                 /* Can't send this cached data async, since the cache page
800                  * might get updated before it gets to the wire
801                  */
802                 send_async = false;
803             }
804         }
805     }
806 
807     /* XBZRLE overflow or normal page */
808     if (pages == -1) {
809         *bytes_transferred += save_page_header(f, block,
810                                                offset | RAM_SAVE_FLAG_PAGE);
811         if (send_async) {
812             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
813                                   migrate_release_ram() &&
814                                   migration_in_postcopy(ms));
815         } else {
816             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
817         }
818         *bytes_transferred += TARGET_PAGE_SIZE;
819         pages = 1;
820         acct_info.norm_pages++;
821     }
822 
823     XBZRLE_cache_unlock();
824 
825     return pages;
826 }
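
/*
 * Decision-order summary (editorial sketch of the function above): each
 * page is tried, in order, against
 *
 *     1. ram_control_save_page()  - hook (e.g. RDMA) that may send it itself
 *     2. save_zero_page()         - header + 1 byte when the page is all zero
 *     3. save_xbzrle_page()       - cache delta, only outside the bulk stage
 *     4. RAM_SAVE_FLAG_PAGE       - a full TARGET_PAGE_SIZE copy
 *
 * and only the first method that handles the page is used.
 */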
827 
828 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
829                                 ram_addr_t offset)
830 {
831     int bytes_sent, blen;
832     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
833 
834     bytes_sent = save_page_header(f, block, offset |
835                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
836     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
837                                      migrate_compress_level());
838     if (blen < 0) {
839         bytes_sent = 0;
840         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
841         error_report("compressed data failed!");
842     } else {
843         bytes_sent += blen;
844         ram_release_pages(migrate_get_current(), block->idstr,
845                           offset & TARGET_PAGE_MASK, 1);
846     }
847 
848     return bytes_sent;
849 }
850 
851 static uint64_t bytes_transferred;
852 
853 static void flush_compressed_data(QEMUFile *f)
854 {
855     int idx, len, thread_count;
856 
857     if (!migrate_use_compression()) {
858         return;
859     }
860     thread_count = migrate_compress_threads();
861 
862     qemu_mutex_lock(&comp_done_lock);
863     for (idx = 0; idx < thread_count; idx++) {
864         while (!comp_param[idx].done) {
865             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
866         }
867     }
868     qemu_mutex_unlock(&comp_done_lock);
869 
870     for (idx = 0; idx < thread_count; idx++) {
871         qemu_mutex_lock(&comp_param[idx].mutex);
872         if (!comp_param[idx].quit) {
873             len = qemu_put_qemu_file(f, comp_param[idx].file);
874             bytes_transferred += len;
875         }
876         qemu_mutex_unlock(&comp_param[idx].mutex);
877     }
878 }
879 
880 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
881                                        ram_addr_t offset)
882 {
883     param->block = block;
884     param->offset = offset;
885 }
886 
887 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
888                                            ram_addr_t offset,
889                                            uint64_t *bytes_transferred)
890 {
891     int idx, thread_count, bytes_xmit = -1, pages = -1;
892 
893     thread_count = migrate_compress_threads();
894     qemu_mutex_lock(&comp_done_lock);
895     while (true) {
896         for (idx = 0; idx < thread_count; idx++) {
897             if (comp_param[idx].done) {
898                 comp_param[idx].done = false;
899                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
900                 qemu_mutex_lock(&comp_param[idx].mutex);
901                 set_compress_params(&comp_param[idx], block, offset);
902                 qemu_cond_signal(&comp_param[idx].cond);
903                 qemu_mutex_unlock(&comp_param[idx].mutex);
904                 pages = 1;
905                 acct_info.norm_pages++;
906                 *bytes_transferred += bytes_xmit;
907                 break;
908             }
909         }
910         if (pages > 0) {
911             break;
912         } else {
913             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
914         }
915     }
916     qemu_mutex_unlock(&comp_done_lock);
917 
918     return pages;
919 }
920 
921 /**
922  * ram_save_compressed_page: compress the given page and send it to the stream
923  *
924  * Returns: Number of pages written.
925  *
926  * @ms: The current migration state.
927  * @f: QEMUFile where to send the data
928  * @block: block that contains the page we want to send
929  * @offset: offset inside the block for the page
930  * @last_stage: if we are at the completion stage
931  * @bytes_transferred: increase it with the number of transferred bytes
932  */
933 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
934                                     PageSearchStatus *pss, bool last_stage,
935                                     uint64_t *bytes_transferred)
936 {
937     int pages = -1;
938     uint64_t bytes_xmit = 0;
939     uint8_t *p;
940     int ret, blen;
941     RAMBlock *block = pss->block;
942     ram_addr_t offset = pss->offset;
943 
944     p = block->host + offset;
945 
946     ret = ram_control_save_page(f, block->offset,
947                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
948     if (bytes_xmit) {
949         *bytes_transferred += bytes_xmit;
950         pages = 1;
951     }
952     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
953         if (ret != RAM_SAVE_CONTROL_DELAYED) {
954             if (bytes_xmit > 0) {
955                 acct_info.norm_pages++;
956             } else if (bytes_xmit == 0) {
957                 acct_info.dup_pages++;
958             }
959         }
960     } else {
961         /* When starting the process of a new block, the first page of
962          * the block should be sent out before other pages in the same
963          * block, and all the pages in the last block should have been sent
964          * out. Keeping this order is important, because the 'cont' flag
965          * is used to avoid resending the block name.
966          */
967         if (block != last_sent_block) {
968             flush_compressed_data(f);
969             pages = save_zero_page(f, block, offset, p, bytes_transferred);
970             if (pages == -1) {
971                 /* Make sure the first page is sent out before other pages */
972                 bytes_xmit = save_page_header(f, block, offset |
973                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
974                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
975                                                  migrate_compress_level());
976                 if (blen > 0) {
977                     *bytes_transferred += bytes_xmit + blen;
978                     acct_info.norm_pages++;
979                     pages = 1;
980                 } else {
981                     qemu_file_set_error(f, blen);
982                     error_report("compressed data failed!");
983                 }
984             }
985             if (pages > 0) {
986                 ram_release_pages(ms, block->idstr, pss->offset, pages);
987             }
988         } else {
989             offset |= RAM_SAVE_FLAG_CONTINUE;
990             pages = save_zero_page(f, block, offset, p, bytes_transferred);
991             if (pages == -1) {
992                 pages = compress_page_with_multi_thread(f, block, offset,
993                                                         bytes_transferred);
994             } else {
995                 ram_release_pages(ms, block->idstr, pss->offset, pages);
996             }
997         }
998     }
999 
1000     return pages;
1001 }
1002 
1003 /*
1004  * Find the next dirty page and update any state associated with
1005  * the search process.
1006  *
1007  * Returns: True if a page is found
1008  *
1009  * @f: Current migration stream.
1010  * @pss: Data about the state of the current dirty page scan.
1011  * @again: Set to false if the search has scanned the whole of RAM
1012  * @ram_addr_abs: Pointer into which to store the address of the dirty page
1013  *               within the global ram_addr space
1014  */
1015 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
1016                              bool *again, ram_addr_t *ram_addr_abs)
1017 {
1018     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
1019                                               ram_addr_abs);
1020     if (pss->complete_round && pss->block == last_seen_block &&
1021         pss->offset >= last_offset) {
1022         /*
1023          * We've been once around the RAM and haven't found anything.
1024          * Give up.
1025          */
1026         *again = false;
1027         return false;
1028     }
1029     if (pss->offset >= pss->block->used_length) {
1030         /* Didn't find anything in this RAM Block */
1031         pss->offset = 0;
1032         pss->block = QLIST_NEXT_RCU(pss->block, next);
1033         if (!pss->block) {
1034             /* Hit the end of the list */
1035             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1036             /* Flag that we've looped */
1037             pss->complete_round = true;
1038             ram_bulk_stage = false;
1039             if (migrate_use_xbzrle()) {
1040                 /* If xbzrle is on, stop using the data compression at this
1041                  * point. In theory, xbzrle can do better than compression.
1042                  */
1043                 flush_compressed_data(f);
1044                 compression_switch = false;
1045             }
1046         }
1047         /* Didn't find anything this time, but try again on the new block */
1048         *again = true;
1049         return false;
1050     } else {
1051         /* Can go around again, but... */
1052         *again = true;
1053         /* We've found something so probably don't need to */
1054         return true;
1055     }
1056 }
1057 
1058 /*
1059  * Helper for 'get_queued_page' - gets a page off the queue
1060  *      ms:      MigrationState in
1061  * *offset:      Used to return the offset within the RAMBlock
1062  * ram_addr_abs: global offset in the dirty/sent bitmaps
1063  *
1064  * Returns:      block (or NULL if none available)
1065  */
1066 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1067                               ram_addr_t *ram_addr_abs)
1068 {
1069     RAMBlock *block = NULL;
1070 
1071     qemu_mutex_lock(&ms->src_page_req_mutex);
1072     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1073         struct MigrationSrcPageRequest *entry =
1074                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1075         block = entry->rb;
1076         *offset = entry->offset;
1077         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1078                         TARGET_PAGE_MASK;
1079 
1080         if (entry->len > TARGET_PAGE_SIZE) {
1081             entry->len -= TARGET_PAGE_SIZE;
1082             entry->offset += TARGET_PAGE_SIZE;
1083         } else {
1084             memory_region_unref(block->mr);
1085             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1086             g_free(entry);
1087         }
1088     }
1089     qemu_mutex_unlock(&ms->src_page_req_mutex);
1090 
1091     return block;
1092 }
1093 
1094 /*
1095  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1096  * that are already sent (!dirty)
1097  *
1098  *      ms:      MigrationState in
1099  *     pss:      PageSearchStatus structure updated with found block/offset
1100  * ram_addr_abs: global offset in the dirty/sent bitmaps
1101  *
1102  * Returns:      true if a queued page is found
1103  */
1104 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1105                             ram_addr_t *ram_addr_abs)
1106 {
1107     RAMBlock  *block;
1108     ram_addr_t offset;
1109     bool dirty;
1110 
1111     do {
1112         block = unqueue_page(ms, &offset, ram_addr_abs);
1113         /*
1114          * We're sending this page, and since it's postcopy nothing else
1115          * will dirty it, and we must make sure it doesn't get sent again
1116          * even if this queue request was received after the background
1117          * search already sent it.
1118          */
1119         if (block) {
1120             unsigned long *bitmap;
1121             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1122             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1123             if (!dirty) {
1124                 trace_get_queued_page_not_dirty(
1125                     block->idstr, (uint64_t)offset,
1126                     (uint64_t)*ram_addr_abs,
1127                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1128                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1129             } else {
1130                 trace_get_queued_page(block->idstr,
1131                                       (uint64_t)offset,
1132                                       (uint64_t)*ram_addr_abs);
1133             }
1134         }
1135 
1136     } while (block && !dirty);
1137 
1138     if (block) {
1139         /*
1140          * As soon as we start servicing pages out of order, then we have
1141          * to kill the bulk stage, since the bulk stage assumes
1142          * in migration_bitmap_find_dirty() that every page is
1143          * dirty; that's no longer true.
1144          */
1145         ram_bulk_stage = false;
1146 
1147         /*
1148          * We want the background search to continue from the queued page
1149          * since the guest is likely to want other pages near to the page
1150          * it just requested.
1151          */
1152         pss->block = block;
1153         pss->offset = offset;
1154     }
1155 
1156     return !!block;
1157 }
1158 
1159 /**
1160  * flush_page_queue: Flush any remaining pages in the ram request queue;
1161  *    it should be empty at the end anyway, but in error cases there may be
1162  *    some left.
1163  *
1164  * ms: MigrationState
1165  */
1166 void flush_page_queue(MigrationState *ms)
1167 {
1168     struct MigrationSrcPageRequest *mspr, *next_mspr;
1169     /* This queue generally should be empty - but in the case of a failed
1170      * migration might have some droppings in.
1171      */
1172     rcu_read_lock();
1173     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1174         memory_region_unref(mspr->rb->mr);
1175         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1176         g_free(mspr);
1177     }
1178     rcu_read_unlock();
1179 }
1180 
1181 /**
1182  * Queue the pages for transmission, e.g. a request from postcopy destination
1183  *   ms: MigrationStatus in which the queue is held
1184  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1185  *   start: Offset from the start of the RAMBlock
1186  *   len: Length (in bytes) to send
1187  *   Return: 0 on success
1188  */
1189 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1190                          ram_addr_t start, ram_addr_t len)
1191 {
1192     RAMBlock *ramblock;
1193 
1194     ms->postcopy_requests++;
1195     rcu_read_lock();
1196     if (!rbname) {
1197         /* Reuse last RAMBlock */
1198         ramblock = ms->last_req_rb;
1199 
1200         if (!ramblock) {
1201             /*
1202              * Shouldn't happen, we can't reuse the last RAMBlock if
1203              * it's the 1st request.
1204              */
1205             error_report("ram_save_queue_pages no previous block");
1206             goto err;
1207         }
1208     } else {
1209         ramblock = qemu_ram_block_by_name(rbname);
1210 
1211         if (!ramblock) {
1212             /* We shouldn't be asked for a non-existent RAMBlock */
1213             error_report("ram_save_queue_pages no block '%s'", rbname);
1214             goto err;
1215         }
1216         ms->last_req_rb = ramblock;
1217     }
1218     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1219     if (start+len > ramblock->used_length) {
1220         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1221                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1222                      __func__, start, len, ramblock->used_length);
1223         goto err;
1224     }
1225 
1226     struct MigrationSrcPageRequest *new_entry =
1227         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1228     new_entry->rb = ramblock;
1229     new_entry->offset = start;
1230     new_entry->len = len;
1231 
1232     memory_region_ref(ramblock->mr);
1233     qemu_mutex_lock(&ms->src_page_req_mutex);
1234     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1235     qemu_mutex_unlock(&ms->src_page_req_mutex);
1236     rcu_read_unlock();
1237 
1238     return 0;
1239 
1240 err:
1241     rcu_read_unlock();
1242     return -1;
1243 }
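
/*
 * Usage sketch (illustrative; block name and range are made up): when the
 * postcopy destination faults on a page, the source queues it with
 *
 *     if (ram_save_queue_pages(ms, "pc.ram", 0x5000, TARGET_PAGE_SIZE)) {
 *         // unknown block or a range past used_length
 *     }
 *
 * Passing rbname == NULL reuses the block of the previous request.
 */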
1244 
1245 /**
1246  * ram_save_target_page: Save one target page
1247  *
1248  *
1249  * @f: QEMUFile where to send the data
1250  * @block: pointer to block that contains the page we want to send
1251  * @offset: offset inside the block for the page;
1252  * @last_stage: if we are at the completion stage
1253  * @bytes_transferred: increase it with the number of transferred bytes
1254  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1255  *
1256  * Returns: Number of pages written.
1257  */
1258 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1259                                 PageSearchStatus *pss,
1260                                 bool last_stage,
1261                                 uint64_t *bytes_transferred,
1262                                 ram_addr_t dirty_ram_abs)
1263 {
1264     int res = 0;
1265 
1266     /* Check if the page is dirty and, if so, send it */
1267     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1268         unsigned long *unsentmap;
1269         if (compression_switch && migrate_use_compression()) {
1270             res = ram_save_compressed_page(ms, f, pss,
1271                                            last_stage,
1272                                            bytes_transferred);
1273         } else {
1274             res = ram_save_page(ms, f, pss, last_stage,
1275                                 bytes_transferred);
1276         }
1277 
1278         if (res < 0) {
1279             return res;
1280         }
1281         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1282         if (unsentmap) {
1283             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1284         }
1285         /* Only update last_sent_block if a block was actually sent; xbzrle
1286          * might have decided the page was identical so didn't bother writing
1287          * to the stream.
1288          */
1289         if (res > 0) {
1290             last_sent_block = pss->block;
1291         }
1292     }
1293 
1294     return res;
1295 }
1296 
1297 /**
1298  * ram_save_host_page: Starting at *offset send pages up to the end
1299  *                     of the current host page.  It's valid for the initial
1300  *                     offset to point into the middle of a host page
1301  *                     in which case the remainder of the hostpage is sent.
1302  *                     Only dirty target pages are sent.
1303  *                     Note that the host page size may be a huge page for this
1304  *                     block.
1305  *
1306  * Returns: Number of pages written.
1307  *
1308  * @f: QEMUFile where to send the data
1309  * @block: pointer to block that contains the page we want to send
1310  * @offset: offset inside the block for the page; updated to last target page
1311  *          sent
1312  * @last_stage: if we are at the completion stage
1313  * @bytes_transferred: increase it with the number of transferred bytes
1314  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1315  */
1316 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1317                               PageSearchStatus *pss,
1318                               bool last_stage,
1319                               uint64_t *bytes_transferred,
1320                               ram_addr_t dirty_ram_abs)
1321 {
1322     int tmppages, pages = 0;
1323     size_t pagesize = qemu_ram_pagesize(pss->block);
1324 
1325     do {
1326         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1327                                         bytes_transferred, dirty_ram_abs);
1328         if (tmppages < 0) {
1329             return tmppages;
1330         }
1331 
1332         pages += tmppages;
1333         pss->offset += TARGET_PAGE_SIZE;
1334         dirty_ram_abs += TARGET_PAGE_SIZE;
1335     } while (pss->offset & (pagesize - 1));
1336 
1337     /* The offset we leave with is the last one we looked at */
1338     pss->offset -= TARGET_PAGE_SIZE;
1339     return pages;
1340 }
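
/*
 * Iteration sketch (illustrative sizes): with the block backed by 2 MiB
 * hugepages and 4 KiB target pages, the loop above covers up to 512
 * target pages so the whole host page leaves in one pass; when pagesize
 * equals TARGET_PAGE_SIZE it sends exactly one page.
 */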
1341 
1342 /**
1343  * ram_find_and_save_block: Finds a dirty page and sends it to f
1344  *
1345  * Called within an RCU critical section.
1346  *
1347  * Returns:  The number of pages written
1348  *           0 means no dirty pages
1349  *
1350  * @f: QEMUFile where to send the data
1351  * @last_stage: if we are at the completion stage
1352  * @bytes_transferred: increase it with the number of transferred bytes
1353  *
1354  * On systems where host-page-size > target-page-size it will send all the
1355  * pages in a host page that are dirty.
1356  */
1357 
1358 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1359                                    uint64_t *bytes_transferred)
1360 {
1361     PageSearchStatus pss;
1362     MigrationState *ms = migrate_get_current();
1363     int pages = 0;
1364     bool again, found;
1365     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1366                                  ram_addr_t space */
1367 
1368     /* No dirty page as there is zero RAM */
1369     if (!ram_bytes_total()) {
1370         return pages;
1371     }
1372 
1373     pss.block = last_seen_block;
1374     pss.offset = last_offset;
1375     pss.complete_round = false;
1376 
1377     if (!pss.block) {
1378         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1379     }
1380 
1381     do {
1382         again = true;
1383         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1384 
1385         if (!found) {
1386             /* priority queue empty, so just search for something dirty */
1387             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1388         }
1389 
1390         if (found) {
1391             pages = ram_save_host_page(ms, f, &pss,
1392                                        last_stage, bytes_transferred,
1393                                        dirty_ram_abs);
1394         }
1395     } while (!pages && again);
1396 
1397     last_seen_block = pss.block;
1398     last_offset = pss.offset;
1399 
1400     return pages;
1401 }
1402 
1403 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1404 {
1405     uint64_t pages = size / TARGET_PAGE_SIZE;
1406     if (zero) {
1407         acct_info.dup_pages += pages;
1408     } else {
1409         acct_info.norm_pages += pages;
1410         bytes_transferred += size;
1411         qemu_update_position(f, size);
1412     }
1413 }
1414 
1415 static ram_addr_t ram_save_remaining(void)
1416 {
1417     return migration_dirty_pages;
1418 }
1419 
1420 uint64_t ram_bytes_remaining(void)
1421 {
1422     return ram_save_remaining() * TARGET_PAGE_SIZE;
1423 }
1424 
1425 uint64_t ram_bytes_transferred(void)
1426 {
1427     return bytes_transferred;
1428 }
1429 
1430 uint64_t ram_bytes_total(void)
1431 {
1432     RAMBlock *block;
1433     uint64_t total = 0;
1434 
1435     rcu_read_lock();
1436     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1437         total += block->used_length;
1438     rcu_read_unlock();
1439     return total;
1440 }
1441 
1442 void free_xbzrle_decoded_buf(void)
1443 {
1444     g_free(xbzrle_decoded_buf);
1445     xbzrle_decoded_buf = NULL;
1446 }
1447 
1448 static void migration_bitmap_free(struct BitmapRcu *bmap)
1449 {
1450     g_free(bmap->bmap);
1451     g_free(bmap->unsentmap);
1452     g_free(bmap);
1453 }
1454 
1455 static void ram_migration_cleanup(void *opaque)
1456 {
1457     /* caller has the iothread lock held or is in a bh, so there is
1458      * no write race against this migration_bitmap
1459      */
1460     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1461     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1462     if (bitmap) {
1463         memory_global_dirty_log_stop();
1464         call_rcu(bitmap, migration_bitmap_free, rcu);
1465     }
1466 
1467     XBZRLE_cache_lock();
1468     if (XBZRLE.cache) {
1469         cache_fini(XBZRLE.cache);
1470         g_free(XBZRLE.encoded_buf);
1471         g_free(XBZRLE.current_buf);
1472         g_free(ZERO_TARGET_PAGE);
1473         XBZRLE.cache = NULL;
1474         XBZRLE.encoded_buf = NULL;
1475         XBZRLE.current_buf = NULL;
1476     }
1477     XBZRLE_cache_unlock();
1478 }
1479 
1480 static void reset_ram_globals(void)
1481 {
1482     last_seen_block = NULL;
1483     last_sent_block = NULL;
1484     last_offset = 0;
1485     last_version = ram_list.version;
1486     ram_bulk_stage = true;
1487 }
1488 
1489 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1490 
1491 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1492 {
1493     /* called in qemu main thread, so there is
1494      * no writing race against this migration_bitmap
1495      */
1496     if (migration_bitmap_rcu) {
1497         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1498         bitmap = g_new(struct BitmapRcu, 1);
1499         bitmap->bmap = bitmap_new(new);
1500 
1501         /* prevent bits in the migration bitmap from being set
1502          * by migration_bitmap_sync_range() at the same time.
1503          * It is safe for migration if bits in migration_bitmap are
1504          * cleared at the same time.
1505          */
1506         qemu_mutex_lock(&migration_bitmap_mutex);
1507         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1508         bitmap_set(bitmap->bmap, old, new - old);
1509 
1510         /* We don't have a way to safely extend the unsentmap
1511          * with RCU; so mark it as missing, and entry to postcopy
1512          * will fail.
1513          */
1514         bitmap->unsentmap = NULL;
1515 
1516         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1517         qemu_mutex_unlock(&migration_bitmap_mutex);
1518         migration_dirty_pages += new - old;
1519         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1520     }
1521 }
1522 
1523 /*
1524  * 'expected' is the value you expect the bitmap mostly to be full
1525  * of; it won't bother printing lines that are all this value.
1526  * If 'todump' is null the migration bitmap is dumped.
1527  */
1528 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1529 {
1530     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1531 
1532     int64_t cur;
1533     int64_t linelen = 128;
1534     char linebuf[129];
1535 
1536     if (!todump) {
1537         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1538     }
1539 
1540     for (cur = 0; cur < ram_pages; cur += linelen) {
1541         int64_t curb;
1542         bool found = false;
1543         /*
1544          * Last line; catch the case where the line length
1545          * is longer than remaining ram
1546          */
1547         if (cur + linelen > ram_pages) {
1548             linelen = ram_pages - cur;
1549         }
1550         for (curb = 0; curb < linelen; curb++) {
1551             bool thisbit = test_bit(cur + curb, todump);
1552             linebuf[curb] = thisbit ? '1' : '.';
1553             found = found || (thisbit != expected);
1554         }
1555         if (found) {
1556             linebuf[curb] = '\0';
1557             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1558         }
1559     }
1560 }
1561 
1562 /* **** functions for postcopy ***** */
1563 
1564 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1565 {
1566     struct RAMBlock *block;
1567     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1568 
1569     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1570         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1571         unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1572         unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1573 
1574         while (run_start < range) {
1575             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1576             ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1577                               (run_end - run_start) << TARGET_PAGE_BITS);
1578             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1579         }
1580     }
1581 }
1582 
1583 /*
1584  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1585  * Note: At this point the 'unsentmap' is the processed bitmap combined
1586  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1587  * start: Index into the bitmap of the first bit representing this block
1588  * length: Length of the block in target pages
1589  */
1590 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1591                                         PostcopyDiscardState *pds,
1592                                         unsigned long start,
1593                                         unsigned long length)
1594 {
1595     unsigned long end = start + length; /* one after the end */
1596     unsigned long current;
1597     unsigned long *unsentmap;
1598 
1599     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
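    /*
     * Coalesce each run of set bits into a single discard message.
     * For example, if the unsentmap bits in [start, end) are 00111010,
     * two ranges are sent: (start + 2, length 3) and (start + 6,
     * length 1).
     */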
1600     for (current = start; current < end; ) {
1601         unsigned long one = find_next_bit(unsentmap, end, current);
1602 
1603         if (one <= end) {
1604             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1605             unsigned long discard_length;
1606 
1607             if (zero >= end) {
1608                 discard_length = end - one;
1609             } else {
1610                 discard_length = zero - one;
1611             }
1612             if (discard_length) {
1613                 postcopy_discard_send_range(ms, pds, one, discard_length);
1614             }
1615             current = one + discard_length;
1616         } else {
1617             current = one;
1618         }
1619     }
1620 
1621     return 0;
1622 }
1623 
1624 /*
1625  * Utility for the outgoing postcopy code.
1626  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1627  *   passing it bitmap indexes and name.
1628  * Returns: 0 on success
1629  * (qemu_ram_foreach_block is not used here because it passes unscaled
1630  *  lengths, which would force the postcopy code to deal with target pages)
1631  */
1632 static int postcopy_each_ram_send_discard(MigrationState *ms)
1633 {
1634     struct RAMBlock *block;
1635     int ret;
1636 
1637     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1638         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1639         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1640                                                                first,
1641                                                                block->idstr);
1642 
1643         /*
1644          * Postcopy sends chunks of bitmap over the wire, but it
1645          * just needs indexes at this point; this avoids it needing
1646          * target-page-specific code.
1647          */
1648         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1649                                     block->used_length >> TARGET_PAGE_BITS);
1650         postcopy_discard_send_finish(ms, pds);
1651         if (ret) {
1652             return ret;
1653         }
1654     }
1655 
1656     return 0;
1657 }
1658 
1659 /*
1660  * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1661  *   the two bitmaps, which are similar, but one is inverted.
1662  *
1663  * We search for runs of target-pages that don't start or end on a
1664  * host page boundary;
1665  * unsent_pass=true: Cleans up partially unsent host pages by searching
1666  *                 the unsentmap
1667  * unsent_pass=false: Cleans up partially dirty host pages by searching
1668  *                 the main migration bitmap
1669  *
1670  */
1671 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1672                                           RAMBlock *block,
1673                                           PostcopyDiscardState *pds)
1674 {
1675     unsigned long *bitmap;
1676     unsigned long *unsentmap;
1677     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
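    /* Target pages per host page, e.g. 512 for a 2MB hugetlbfs
     * RAMBlock with 4KB target pages. */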
1678     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1679     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1680     unsigned long last = first + (len - 1);
1681     unsigned long run_start;
1682 
1683     if (block->page_size == TARGET_PAGE_SIZE) {
1684         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1685         return;
1686     }
1687 
1688     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1689     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1690 
1691     if (unsent_pass) {
1692         /* Find a sent page */
1693         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1694     } else {
1695         /* Find a dirty page */
1696         run_start = find_next_bit(bitmap, last + 1, first);
1697     }
1698 
1699     while (run_start <= last) {
1700         bool do_fixup = false;
1701         unsigned long fixup_start_addr;
1702         unsigned long host_offset;
1703 
1704         /*
1705          * If the start of this run of pages is in the middle of a host
1706          * page, then we need to fixup this host page.
1707          */
1708         host_offset = run_start % host_ratio;
1709         if (host_offset) {
1710             do_fixup = true;
1711             run_start -= host_offset;
1712             fixup_start_addr = run_start;
1713             /* For the next pass */
1714             run_start = run_start + host_ratio;
1715         } else {
1716             /* Find the end of this run */
1717             unsigned long run_end;
1718             if (unsent_pass) {
1719                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1720             } else {
1721                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1722             }
1723             /*
1724              * If the end isn't at the start of a host page, then the
1725              * run doesn't finish at the end of a host page
1726              * and we need to discard.
1727              */
1728             host_offset = run_end % host_ratio;
1729             if (host_offset) {
1730                 do_fixup = true;
1731                 fixup_start_addr = run_end - host_offset;
1732                 /*
1733                  * This host page has gone, the next loop iteration starts
1734                  * from after the fixup
1735                  */
1736                 run_start = fixup_start_addr + host_ratio;
1737             } else {
1738                 /*
1739                  * No discards on this iteration, next loop starts from
1740                  * next sent/dirty page
1741                  */
1742                 run_start = run_end + 1;
1743             }
1744         }
1745 
1746         if (do_fixup) {
1747             unsigned long page;
1748 
1749             /* Tell the destination to discard this page */
1750             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1751                 /* For the unsent_pass we:
1752                  *     discard partially sent pages
1753                  * For the !unsent_pass (dirty) we:
1754                  *     discard partially dirty pages that were sent
1755                  *     (any partially sent pages were already discarded
1756                  *     by the previous unsent_pass)
1757                  */
1758                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1759                                             host_ratio);
1760             }
1761 
1762             /* Clean up the bitmap */
1763             for (page = fixup_start_addr;
1764                  page < fixup_start_addr + host_ratio; page++) {
1765                 /* All pages in this host page are now not sent */
1766                 set_bit(page, unsentmap);
1767 
1768                 /*
1769                  * Remark them as dirty, updating the count for any pages
1770                  * that weren't previously dirty.
1771                  */
1772                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1773             }
1774         }
1775 
1776         if (unsent_pass) {
1777             /* Find the next sent page for the next iteration */
1778             run_start = find_next_zero_bit(unsentmap, last + 1,
1779                                            run_start);
1780         } else {
1781             /* Find the next dirty page for the next iteration */
1782             run_start = find_next_bit(bitmap, last + 1, run_start);
1783         }
1784     }
1785 }
1786 
1787 /*
1788  * Utility for the outgoing postcopy code.
1789  *
1790  * Discard any partially sent host-page size chunks, mark any partially
1791  * dirty host-page size chunks as all dirty.  In this case the host-page
1792  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1793  *
1794  * Returns: 0 on success
1795  */
1796 static int postcopy_chunk_hostpages(MigrationState *ms)
1797 {
1798     struct RAMBlock *block;
1799 
1800     /* Easiest way to make sure we don't resume in the middle of a host-page */
1801     last_seen_block = NULL;
1802     last_sent_block = NULL;
1803     last_offset     = 0;
1804 
1805     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1806         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1807 
1808         PostcopyDiscardState *pds =
1809                          postcopy_discard_send_init(ms, first, block->idstr);
1810 
1811         /* First pass: Discard all partially sent host pages */
1812         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1813         /*
1814          * Second pass: Ensure that all partially dirty host pages are made
1815          * fully dirty.
1816          */
1817         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1818 
1819         postcopy_discard_send_finish(ms, pds);
1820     } /* ram_list loop */
1821 
1822     return 0;
1823 }
1824 
1825 /*
1826  * Transmit the set of pages to be discarded after precopy to the target;
1827  * these are pages that:
1828  *     a) have previously been transmitted but are now dirty again
1829  *     b) have never been transmitted; this ensures that any pages on the
1830  *        destination that have been mapped by background tasks get
1831  *        discarded (transparent huge pages are the specific concern)
1832  * Hopefully this set is pretty sparse.
1833  */
1834 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1835 {
1836     int ret;
1837     unsigned long *bitmap, *unsentmap;
1838 
1839     rcu_read_lock();
1840 
1841     /* This should be our last sync, the src is now paused */
1842     migration_bitmap_sync();
1843 
1844     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1845     if (!unsentmap) {
1846         /* We don't have a safe way to resize the sentmap, so
1847          * if the bitmap was resized it will be NULL at this
1848          * point.
1849          */
1850         error_report("migration ram resized during precopy phase");
1851         rcu_read_unlock();
1852         return -EINVAL;
1853     }
1854 
1855     /* Deal with TPS != HPS and huge pages */
1856     ret = postcopy_chunk_hostpages(ms);
1857     if (ret) {
1858         rcu_read_unlock();
1859         return ret;
1860     }
1861 
1862     /*
1863      * Update the unsentmap to be unsentmap = unsentmap | dirty
1864      */
1865     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1866     bitmap_or(unsentmap, unsentmap, bitmap,
1867                last_ram_offset() >> TARGET_PAGE_BITS);
1868 
1869 
1870     trace_ram_postcopy_send_discard_bitmap();
1871 #ifdef DEBUG_POSTCOPY
1872     ram_debug_dump_bitmap(unsentmap, true);
1873 #endif
1874 
1875     ret = postcopy_each_ram_send_discard(ms);
1876     rcu_read_unlock();
1877 
1878     return ret;
1879 }
1880 
1881 /*
1882  * At the start of the postcopy phase of migration, any now-dirty
1883  * precopied pages are discarded.
1884  *
1885  * start, length describe a byte address range within the RAMBlock
1886  *
1887  * Returns 0 on success.
1888  */
1889 int ram_discard_range(MigrationIncomingState *mis,
1890                       const char *block_name,
1891                       uint64_t start, size_t length)
1892 {
1893     int ret = -1;
1894 
1895     trace_ram_discard_range(block_name, start, length);
1896 
1897     rcu_read_lock();
1898     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1899 
1900     if (!rb) {
1901         error_report("ram_discard_range: Failed to find block '%s'",
1902                      block_name);
1903         goto err;
1904     }
1905 
1906     ret = ram_block_discard_range(rb, start, length);
1907 
1908 err:
1909     rcu_read_unlock();
1910 
1911     return ret;
1912 }
1913 
1914 static int ram_save_init_globals(void)
1915 {
1916     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1917 
1918     dirty_rate_high_cnt = 0;
1919     bitmap_sync_count = 0;
1920     migration_bitmap_sync_init();
1921     qemu_mutex_init(&migration_bitmap_mutex);
1922 
1923     if (migrate_use_xbzrle()) {
1924         XBZRLE_cache_lock();
1925         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1926         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1927                                   TARGET_PAGE_SIZE,
1928                                   TARGET_PAGE_SIZE);
1929         if (!XBZRLE.cache) {
1930             XBZRLE_cache_unlock();
1931             error_report("Error creating cache");
1932             return -1;
1933         }
1934         XBZRLE_cache_unlock();
1935 
1936         /* We prefer not to abort if there is no memory */
1937         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1938         if (!XBZRLE.encoded_buf) {
1939             error_report("Error allocating encoded_buf");
1940             return -1;
1941         }
1942 
1943         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1944         if (!XBZRLE.current_buf) {
1945             error_report("Error allocating current_buf");
1946             g_free(XBZRLE.encoded_buf);
1947             XBZRLE.encoded_buf = NULL;
1948             return -1;
1949         }
1950 
1951         acct_clear();
1952     }
1953 
1954     /* For memory_global_dirty_log_start below.  */
1955     qemu_mutex_lock_iothread();
1956 
1957     qemu_mutex_lock_ramlist();
1958     rcu_read_lock();
1959     bytes_transferred = 0;
1960     reset_ram_globals();
1961 
1962     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1963     /* Skip setting bitmap if there is no RAM */
1964     if (ram_bytes_total()) {
1965         ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1966         migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1967         bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1968 
1969         if (migrate_postcopy_ram()) {
1970             migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1971             bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1972         }
1973     }
1974 
1975     /*
1976      * Count the total number of pages used by ram blocks not including any
1977      * gaps due to alignment or unplugs.
1978      */
1979     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1980 
1981     memory_global_dirty_log_start();
1982     migration_bitmap_sync();
1983     qemu_mutex_unlock_ramlist();
1984     qemu_mutex_unlock_iothread();
1985     rcu_read_unlock();
1986 
1987     return 0;
1988 }
1989 
1990 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1991  * long-running RCU critical section.  When RCU reclaims in the code
1992  * start to become numerous it will be necessary to reduce the
1993  * granularity of these critical sections.
1994  */
1995 
1996 static int ram_save_setup(QEMUFile *f, void *opaque)
1997 {
1998     RAMBlock *block;
1999 
2000     /* migration has already set up the bitmap, reuse it. */
2001     if (!migration_in_colo_state()) {
2002         if (ram_save_init_globals() < 0) {
2003             return -1;
2004         }
2005     }
2006 
2007     rcu_read_lock();
2008 
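    /*
     * Setup-stage stream layout (parsed by the RAM_SAVE_FLAG_MEM_SIZE
     * handling in ram_load): the total RAM size with the flag or'd in,
     * then, for each RAMBlock: idstr length, idstr, used_length and,
     * if postcopy is enabled and the block's page size differs from
     * the host page size, the block's page size.
     */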
2009     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2010 
2011     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2012         qemu_put_byte(f, strlen(block->idstr));
2013         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2014         qemu_put_be64(f, block->used_length);
2015         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2016             qemu_put_be64(f, block->page_size);
2017         }
2018     }
2019 
2020     rcu_read_unlock();
2021 
2022     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2023     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2024 
2025     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2026 
2027     return 0;
2028 }
2029 
2030 static int ram_save_iterate(QEMUFile *f, void *opaque)
2031 {
2032     int ret;
2033     int i;
2034     int64_t t0;
2035     int done = 0;
2036 
2037     rcu_read_lock();
2038     if (ram_list.version != last_version) {
2039         reset_ram_globals();
2040     }
2041 
2042     /* Read version before ram_list.blocks */
2043     smp_rmb();
2044 
2045     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2046 
2047     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2048     i = 0;
2049     while ((ret = qemu_file_rate_limit(f)) == 0) {
2050         int pages;
2051 
2052         pages = ram_find_and_save_block(f, false, &bytes_transferred);
2053         /* no more pages to send */
2054         if (pages == 0) {
2055             done = 1;
2056             break;
2057         }
2058         acct_info.iterations++;
2059 
2060         /* We want to check in the 1st loop, just in case it was the 1st
2061            time and we had to sync the dirty bitmap.
2062            qemu_clock_get_ns() is a bit expensive, so we only check every
2063            few iterations.
2064         */
2065         if ((i & 63) == 0) {
2066             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2067             if (t1 > MAX_WAIT) {
2068                 trace_ram_save_iterate_big_wait(t1, i);
2069                 break;
2070             }
2071         }
2072         i++;
2073     }
2074     flush_compressed_data(f);
2075     rcu_read_unlock();
2076 
2077     /*
2078      * Must occur before EOS (or any QEMUFile operation)
2079      * because of RDMA protocol.
2080      */
2081     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2082 
2083     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2084     bytes_transferred += 8;
2085 
2086     ret = qemu_file_get_error(f);
2087     if (ret < 0) {
2088         return ret;
2089     }
2090 
2091     return done;
2092 }
2093 
2094 /* Called with iothread lock */
2095 static int ram_save_complete(QEMUFile *f, void *opaque)
2096 {
2097     rcu_read_lock();
2098 
2099     if (!migration_in_postcopy(migrate_get_current())) {
2100         migration_bitmap_sync();
2101     }
2102 
2103     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2104 
2105     /* try transferring iterative blocks of memory */
2106 
2107     /* flush all remaining blocks regardless of rate limiting */
2108     while (true) {
2109         int pages;
2110 
2111         pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2112                                         &bytes_transferred);
2113         /* no more blocks to send */
2114         if (pages == 0) {
2115             break;
2116         }
2117     }
2118 
2119     flush_compressed_data(f);
2120     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2121 
2122     rcu_read_unlock();
2123 
2124     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2125 
2126     return 0;
2127 }
2128 
2129 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2130                              uint64_t *non_postcopiable_pending,
2131                              uint64_t *postcopiable_pending)
2132 {
2133     uint64_t remaining_size;
2134 
2135     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2136 
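    /* Re-sync the dirty bitmap only once the stale estimate drops below
     * max_size; syncing is relatively expensive and an accurate count
     * mainly matters when we may be about to complete. */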
2137     if (!migration_in_postcopy(migrate_get_current()) &&
2138         remaining_size < max_size) {
2139         qemu_mutex_lock_iothread();
2140         rcu_read_lock();
2141         migration_bitmap_sync();
2142         rcu_read_unlock();
2143         qemu_mutex_unlock_iothread();
2144         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2145     }
2146 
2147     /* We can do postcopy, and all the data is postcopiable */
2148     *postcopiable_pending += remaining_size;
2149 }
2150 
2151 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2152 {
2153     unsigned int xh_len;
2154     int xh_flags;
2155     uint8_t *loaded_data;
2156 
2157     if (!xbzrle_decoded_buf) {
2158         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2159     }
2160     loaded_data = xbzrle_decoded_buf;
2161 
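    /*
     * Wire format of an XBZRLE page: a one-byte flags field (which must
     * be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length,
     * then xh_len bytes of XBZRLE-encoded data.
     */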
2162     /* extract RLE header */
2163     xh_flags = qemu_get_byte(f);
2164     xh_len = qemu_get_be16(f);
2165 
2166     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2167         error_report("Failed to load XBZRLE page - wrong compression!");
2168         return -1;
2169     }
2170 
2171     if (xh_len > TARGET_PAGE_SIZE) {
2172         error_report("Failed to load XBZRLE page - len overflow!");
2173         return -1;
2174     }
2175     /* load data and decode */
2176     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2177 
2178     /* decode RLE */
2179     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2180                              TARGET_PAGE_SIZE) == -1) {
2181         error_report("Failed to load XBZRLE page - decode error!");
2182         return -1;
2183     }
2184 
2185     return 0;
2186 }
2187 
2188 /*
2189  * Read a RAMBlock ID from the stream f.
2190  * Must be called from within an RCU critical section; returns a pointer
2191  * from within the RCU-protected ram_list.
2192  *
2193  * f: Stream to read from
2194  * flags: Page flags (mostly to see if it's a continuation of a previous
2195  *        block)
2196  */
2197 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2198                                               int flags)
2199 {
2200     static RAMBlock *block = NULL;
2201     char id[256];
2202     uint8_t len;
2203 
2204     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2205         if (!block) {
2206             error_report("Ack, bad migration stream!");
2207             return NULL;
2208         }
2209         return block;
2210     }
2211 
2212     len = qemu_get_byte(f);
2213     qemu_get_buffer(f, (uint8_t *)id, len);
2214     id[len] = 0;
2215 
2216     block = qemu_ram_block_by_name(id);
2217     if (!block) {
2218         error_report("Can't find block %s", id);
2219         return NULL;
2220     }
2221 
2222     return block;
2223 }
2224 
2225 static inline void *host_from_ram_block_offset(RAMBlock *block,
2226                                                ram_addr_t offset)
2227 {
2228     if (!offset_in_ramblock(block, offset)) {
2229         return NULL;
2230     }
2231 
2232     return block->host + offset;
2233 }
2234 
2235 /*
2236  * If a page (or a whole RDMA chunk) has been
2237  * determined to be zero, then zap it.
2238  */
2239 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2240 {
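    /* Skip the memset when the page is being zapped to zero and is
     * already zero, so we don't needlessly write to (and possibly
     * allocate) pages that are untouched. */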
2241     if (ch != 0 || !is_zero_range(host, size)) {
2242         memset(host, ch, size);
2243     }
2244 }
2245 
2246 static void *do_data_decompress(void *opaque)
2247 {
2248     DecompressParam *param = opaque;
2249     unsigned long pagesize;
2250     uint8_t *des;
2251     int len;
2252 
2253     qemu_mutex_lock(&param->mutex);
2254     while (!param->quit) {
2255         if (param->des) {
2256             des = param->des;
2257             len = param->len;
2258             param->des = 0;
2259             qemu_mutex_unlock(&param->mutex);
2260 
2261             pagesize = TARGET_PAGE_SIZE;
2262             /* uncompress() can fail in some cases, especially when the
2263              * page was dirtied while it was being compressed.  That's not
2264              * a problem because the dirty page will be retransferred and
2265              * uncompress() won't corrupt the data in other pages.
2266              */
2267             uncompress((Bytef *)des, &pagesize,
2268                        (const Bytef *)param->compbuf, len);
2269 
2270             qemu_mutex_lock(&decomp_done_lock);
2271             param->done = true;
2272             qemu_cond_signal(&decomp_done_cond);
2273             qemu_mutex_unlock(&decomp_done_lock);
2274 
2275             qemu_mutex_lock(&param->mutex);
2276         } else {
2277             qemu_cond_wait(&param->cond, &param->mutex);
2278         }
2279     }
2280     qemu_mutex_unlock(&param->mutex);
2281 
2282     return NULL;
2283 }
2284 
2285 static void wait_for_decompress_done(void)
2286 {
2287     int idx, thread_count;
2288 
2289     if (!migrate_use_compression()) {
2290         return;
2291     }
2292 
2293     thread_count = migrate_decompress_threads();
2294     qemu_mutex_lock(&decomp_done_lock);
2295     for (idx = 0; idx < thread_count; idx++) {
2296         while (!decomp_param[idx].done) {
2297             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2298         }
2299     }
2300     qemu_mutex_unlock(&decomp_done_lock);
2301 }
2302 
2303 void migrate_decompress_threads_create(void)
2304 {
2305     int i, thread_count;
2306 
2307     thread_count = migrate_decompress_threads();
2308     decompress_threads = g_new0(QemuThread, thread_count);
2309     decomp_param = g_new0(DecompressParam, thread_count);
2310     qemu_mutex_init(&decomp_done_lock);
2311     qemu_cond_init(&decomp_done_cond);
2312     for (i = 0; i < thread_count; i++) {
2313         qemu_mutex_init(&decomp_param[i].mutex);
2314         qemu_cond_init(&decomp_param[i].cond);
2315         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2316         decomp_param[i].done = true;
2317         decomp_param[i].quit = false;
2318         qemu_thread_create(decompress_threads + i, "decompress",
2319                            do_data_decompress, decomp_param + i,
2320                            QEMU_THREAD_JOINABLE);
2321     }
2322 }
2323 
2324 void migrate_decompress_threads_join(void)
2325 {
2326     int i, thread_count;
2327 
2328     thread_count = migrate_decompress_threads();
2329     for (i = 0; i < thread_count; i++) {
2330         qemu_mutex_lock(&decomp_param[i].mutex);
2331         decomp_param[i].quit = true;
2332         qemu_cond_signal(&decomp_param[i].cond);
2333         qemu_mutex_unlock(&decomp_param[i].mutex);
2334     }
2335     for (i = 0; i < thread_count; i++) {
2336         qemu_thread_join(decompress_threads + i);
2337         qemu_mutex_destroy(&decomp_param[i].mutex);
2338         qemu_cond_destroy(&decomp_param[i].cond);
2339         g_free(decomp_param[i].compbuf);
2340     }
2341     g_free(decompress_threads);
2342     g_free(decomp_param);
2343     decompress_threads = NULL;
2344     decomp_param = NULL;
2345 }
2346 
2347 static void decompress_data_with_multi_threads(QEMUFile *f,
2348                                                void *host, int len)
2349 {
2350     int idx, thread_count;
2351 
2352     thread_count = migrate_decompress_threads();
2353     qemu_mutex_lock(&decomp_done_lock);
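    /* Hand the compressed page to the first idle decompression thread;
     * if all of them are busy, wait on decomp_done_cond until one
     * finishes. */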
2354     while (true) {
2355         for (idx = 0; idx < thread_count; idx++) {
2356             if (decomp_param[idx].done) {
2357                 decomp_param[idx].done = false;
2358                 qemu_mutex_lock(&decomp_param[idx].mutex);
2359                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2360                 decomp_param[idx].des = host;
2361                 decomp_param[idx].len = len;
2362                 qemu_cond_signal(&decomp_param[idx].cond);
2363                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2364                 break;
2365             }
2366         }
2367         if (idx < thread_count) {
2368             break;
2369         } else {
2370             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2371         }
2372     }
2373     qemu_mutex_unlock(&decomp_done_lock);
2374 }
2375 
2376 /*
2377  * Allocate the data structures etc. needed by incoming migration with
2378  * postcopy-ram; the similarly named postcopy_ram_incoming_init does the work.
2379  */
2380 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2381 {
2382     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2383 
2384     return postcopy_ram_incoming_init(mis, ram_pages);
2385 }
2386 
2387 /*
2388  * Called in postcopy mode by ram_load().
2389  * rcu_read_lock is taken prior to this being called.
2390  */
2391 static int ram_load_postcopy(QEMUFile *f)
2392 {
2393     int flags = 0, ret = 0;
2394     bool place_needed = false;
2395     bool matching_page_sizes = false;
2396     MigrationIncomingState *mis = migration_incoming_get_current();
2397     /* Temporary page that is later 'placed' */
2398     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2399     void *last_host = NULL;
2400     bool all_zero = false;
2401 
2402     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2403         ram_addr_t addr;
2404         void *host = NULL;
2405         void *page_buffer = NULL;
2406         void *place_source = NULL;
2407         RAMBlock *block = NULL;
2408         uint8_t ch;
2409 
2410         addr = qemu_get_be64(f);
2411         flags = addr & ~TARGET_PAGE_MASK;
2412         addr &= TARGET_PAGE_MASK;
2413 
2414         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2415         place_needed = false;
2416         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2417             block = ram_block_from_stream(f, flags);
2418 
2419             host = host_from_ram_block_offset(block, addr);
2420             if (!host) {
2421                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2422                 ret = -EINVAL;
2423                 break;
2424             }
2425             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2426             /*
2427              * Postcopy requires that we place whole host pages atomically;
2428              * these may be huge pages for RAMBlocks that are backed by
2429              * hugetlbfs.
2430              * To make it atomic, the data is read into a temporary page
2431              * that's moved into place later.
2432              * The migration protocol uses (possibly smaller) target pages;
2433              * however, the source ensures it always sends all the
2434              * components of a host page in order.
2435              */
2436             page_buffer = postcopy_host_page +
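            /* e.g. for a 2MB hugetlbfs RAMBlock with 4KB target pages,
             * 512 consecutive target pages are accumulated in
             * postcopy_host_page before one atomic placement. */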
2437                           ((uintptr_t)host & (block->page_size - 1));
2438             /* If all TP are zero then we can optimise the place */
2439             if (!((uintptr_t)host & (block->page_size - 1))) {
2440                 all_zero = true;
2441             } else {
2442                 /* not the 1st TP within the HP */
2443                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2444                     error_report("Non-sequential target page %p/%p",
2445                                   host, last_host);
2446                     ret = -EINVAL;
2447                     break;
2448                 }
2449             }
2450 
2451 
2452             /*
2453              * If it's the last part of a host page then we place the host
2454              * page
2455              */
2456             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2457                                      (block->page_size - 1)) == 0;
2458             place_source = postcopy_host_page;
2459         }
2460         last_host = host;
2461 
2462         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2463         case RAM_SAVE_FLAG_COMPRESS:
2464             ch = qemu_get_byte(f);
2465             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2466             if (ch) {
2467                 all_zero = false;
2468             }
2469             break;
2470 
2471         case RAM_SAVE_FLAG_PAGE:
2472             all_zero = false;
2473             if (!place_needed || !matching_page_sizes) {
2474                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2475             } else {
2476                 /* Avoid copying the data out of the QEMUFile buffer here;
2477                  * placing the page will copy it again later.  This only
2478                  * works when we read the page in one go (matching page
2479                  * sizes). */
2480                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2481                                          TARGET_PAGE_SIZE);
2482             }
2483             break;
2484         case RAM_SAVE_FLAG_EOS:
2485             /* normal exit */
2486             break;
2487         default:
2488             error_report("Unknown combination of migration flags: %#x"
2489                          " (postcopy mode)", flags);
2490             ret = -EINVAL;
2491         }
2492 
2493         if (place_needed) {
2494             /* This gets called at the last target page in the host page */
2495             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2496 
2497             if (all_zero) {
2498                 ret = postcopy_place_page_zero(mis, place_dest,
2499                                                block->page_size);
2500             } else {
2501                 ret = postcopy_place_page(mis, place_dest,
2502                                           place_source, block->page_size);
2503             }
2504         }
2505         if (!ret) {
2506             ret = qemu_file_get_error(f);
2507         }
2508     }
2509 
2510     return ret;
2511 }
2512 
2513 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2514 {
2515     int flags = 0, ret = 0;
2516     static uint64_t seq_iter;
2517     int len = 0;
2518     /*
2519      * If the system is running in postcopy mode, page inserts into host
2520      * memory must be atomic.
2521      */
2522     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2523     /* ADVISE is earlier; it shows the source has the postcopy capability on */
2524     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2525 
2526     seq_iter++;
2527 
2528     if (version_id != 4) {
2529         ret = -EINVAL;
2530     }
2531 
2532     /* This RCU critical section can be very long running.
2533      * When RCU reclaims in the code start to become numerous,
2534      * it will be necessary to reduce the granularity of this
2535      * critical section.
2536      */
2537     rcu_read_lock();
2538 
2539     if (postcopy_running) {
2540         ret = ram_load_postcopy(f);
2541     }
2542 
2543     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2544         ram_addr_t addr, total_ram_bytes;
2545         void *host = NULL;
2546         uint8_t ch;
2547 
2548         addr = qemu_get_be64(f);
2549         flags = addr & ~TARGET_PAGE_MASK;
2550         addr &= TARGET_PAGE_MASK;
2551 
2552         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2553                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2554             RAMBlock *block = ram_block_from_stream(f, flags);
2555 
2556             host = host_from_ram_block_offset(block, addr);
2557             if (!host) {
2558                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2559                 ret = -EINVAL;
2560                 break;
2561             }
2562         }
2563 
2564         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2565         case RAM_SAVE_FLAG_MEM_SIZE:
2566             /* Synchronize RAM block list */
2567             total_ram_bytes = addr;
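            /* addr carries the total RAM size written by ram_save_setup;
             * walk the per-block records (idstr, used_length, optional
             * page size) until it has all been accounted for. */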
2568             while (!ret && total_ram_bytes) {
2569                 RAMBlock *block;
2570                 char id[256];
2571                 ram_addr_t length;
2572 
2573                 len = qemu_get_byte(f);
2574                 qemu_get_buffer(f, (uint8_t *)id, len);
2575                 id[len] = 0;
2576                 length = qemu_get_be64(f);
2577 
2578                 block = qemu_ram_block_by_name(id);
2579                 if (block) {
2580                     if (length != block->used_length) {
2581                         Error *local_err = NULL;
2582 
2583                         ret = qemu_ram_resize(block, length,
2584                                               &local_err);
2585                         if (local_err) {
2586                             error_report_err(local_err);
2587                         }
2588                     }
2589                     /* For postcopy we need to check hugepage sizes match */
2590                     if (postcopy_advised &&
2591                         block->page_size != qemu_host_page_size) {
2592                         uint64_t remote_page_size = qemu_get_be64(f);
2593                         if (remote_page_size != block->page_size) {
2594                             error_report("Mismatched RAM page size %s "
2595                                          "(local) %zd != %" PRId64,
2596                                          id, block->page_size,
2597                                          remote_page_size);
2598                             ret = -EINVAL;
2599                         }
2600                     }
2601                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2602                                           block->idstr);
2603                 } else {
2604                     error_report("Unknown ramblock \"%s\", cannot "
2605                                  "accept migration", id);
2606                     ret = -EINVAL;
2607                 }
2608 
2609                 total_ram_bytes -= length;
2610             }
2611             break;
2612 
2613         case RAM_SAVE_FLAG_COMPRESS:
2614             ch = qemu_get_byte(f);
2615             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2616             break;
2617 
2618         case RAM_SAVE_FLAG_PAGE:
2619             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2620             break;
2621 
2622         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2623             len = qemu_get_be32(f);
2624             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2625                 error_report("Invalid compressed data length: %d", len);
2626                 ret = -EINVAL;
2627                 break;
2628             }
2629             decompress_data_with_multi_threads(f, host, len);
2630             break;
2631 
2632         case RAM_SAVE_FLAG_XBZRLE:
2633             if (load_xbzrle(f, addr, host) < 0) {
2634                 error_report("Failed to decompress XBZRLE page at "
2635                              RAM_ADDR_FMT, addr);
2636                 ret = -EINVAL;
2637                 break;
2638             }
2639             break;
2640         case RAM_SAVE_FLAG_EOS:
2641             /* normal exit */
2642             break;
2643         default:
2644             if (flags & RAM_SAVE_FLAG_HOOK) {
2645                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2646             } else {
2647                 error_report("Unknown combination of migration flags: %#x",
2648                              flags);
2649                 ret = -EINVAL;
2650             }
2651         }
2652         if (!ret) {
2653             ret = qemu_file_get_error(f);
2654         }
2655     }
2656 
2657     wait_for_decompress_done();
2658     rcu_read_unlock();
2659     trace_ram_load_complete(ret, seq_iter);
2660     return ret;
2661 }
2662 
2663 static SaveVMHandlers savevm_ram_handlers = {
2664     .save_live_setup = ram_save_setup,
2665     .save_live_iterate = ram_save_iterate,
2666     .save_live_complete_postcopy = ram_save_complete,
2667     .save_live_complete_precopy = ram_save_complete,
2668     .save_live_pending = ram_save_pending,
2669     .load_state = ram_load,
2670     .cleanup = ram_migration_cleanup,
2671 };
2672 
2673 void ram_mig_init(void)
2674 {
2675     qemu_mutex_init(&XBZRLE.lock);
2676     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2677 }
2678