xref: /openbmc/qemu/migration/ram.c (revision a9ded601)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "ram.h"
40 #include "migration.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "migration/vmstate.h"
45 #include "postcopy-ram.h"
46 #include "exec/address-spaces.h"
47 #include "migration/page_cache.h"
48 #include "qemu/error-report.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 
54 /***********************************************************/
55 /* ram save/restore */
56 
57 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
58  * worked for pages that were filled with the same char.  We switched
59  * it to only search for the zero value, and renamed it to avoid
60  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
61  */
62 
63 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
64 #define RAM_SAVE_FLAG_ZERO     0x02
65 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
66 #define RAM_SAVE_FLAG_PAGE     0x08
67 #define RAM_SAVE_FLAG_EOS      0x10
68 #define RAM_SAVE_FLAG_CONTINUE 0x20
69 #define RAM_SAVE_FLAG_XBZRLE   0x40
70 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
71 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
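/*
 * Illustrative note (not from the original file): these flag bits travel in
 * the low bits of the page offset, which is target-page aligned.  A sender
 * effectively does, as save_page_header() below shows,
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * so the receiver can split the value back into the aligned offset and the
 * flag bits, and any new flag must stay below TARGET_PAGE_SIZE.
 */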
72 
73 static inline bool is_zero_range(uint8_t *p, uint64_t size)
74 {
75     return buffer_is_zero(p, size);
76 }
77 
78 XBZRLECacheStats xbzrle_counters;
79 
80 /* struct containing the XBZRLE cache and the scratch buffers
81    used by the encoding */
82 static struct {
83     /* buffer used for XBZRLE encoding */
84     uint8_t *encoded_buf;
85     /* buffer for storing page content */
86     uint8_t *current_buf;
87     /* Cache for XBZRLE, Protected by lock. */
88     PageCache *cache;
89     QemuMutex lock;
90     /* it will store a page full of zeros */
91     uint8_t *zero_target_page;
92 } XBZRLE;
93 
94 /* buffer used for XBZRLE decoding */
95 static uint8_t *xbzrle_decoded_buf;
96 
97 static void XBZRLE_cache_lock(void)
98 {
99     if (migrate_use_xbzrle())
100         qemu_mutex_lock(&XBZRLE.lock);
101 }
102 
103 static void XBZRLE_cache_unlock(void)
104 {
105     if (migrate_use_xbzrle())
106         qemu_mutex_unlock(&XBZRLE.lock);
107 }
108 
109 /**
110  * xbzrle_cache_resize: resize the xbzrle cache
111  *
112  * This function is called from qmp_migrate_set_cache_size in the main
113  * thread, possibly while a migration is in progress.  A running
114  * migration may be using the cache and might finish during this call,
115  * hence changes to the cache are protected by the XBZRLE.lock mutex.
116  *
117  * Returns the new size (rounded down to a power of two) or negative on error.
118  *
119  * @new_size: new cache size
120  */
121 int64_t xbzrle_cache_resize(int64_t new_size)
122 {
123     PageCache *new_cache;
124     int64_t ret;
125 
126     if (new_size < TARGET_PAGE_SIZE) {
127         return -1;
128     }
129 
130     XBZRLE_cache_lock();
131 
132     if (XBZRLE.cache != NULL) {
133         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
134             goto out_new_size;
135         }
136         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
137                                         TARGET_PAGE_SIZE);
138         if (!new_cache) {
139             error_report("Error creating cache");
140             ret = -1;
141             goto out;
142         }
143 
144         cache_fini(XBZRLE.cache);
145         XBZRLE.cache = new_cache;
146     }
147 
148 out_new_size:
149     ret = pow2floor(new_size);
150 out:
151     XBZRLE_cache_unlock();
152     return ret;
153 }
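/*
 * Usage sketch (illustrative, not part of this file): the QMP handler is
 * expected to call this along the lines of
 *
 *     int64_t ret = xbzrle_cache_resize(value);
 *     if (ret < 0) {
 *         ... report the error back to the monitor ...
 *     }
 *
 * and to record the returned, power-of-two-floored size as the value later
 * reported by migrate_xbzrle_cache_size().
 */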
154 
155 /*
156  * An outstanding page request, on the source, having been received
157  * and queued
158  */
159 struct RAMSrcPageRequest {
160     RAMBlock *rb;
161     hwaddr    offset;
162     hwaddr    len;
163 
164     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
165 };
166 
167 /* State of RAM for migration */
168 struct RAMState {
169     /* QEMUFile used for this migration */
170     QEMUFile *f;
171     /* Last block that we have visited searching for dirty pages */
172     RAMBlock *last_seen_block;
173     /* Last block from where we have sent data */
174     RAMBlock *last_sent_block;
175     /* Last dirty target page we have sent */
176     ram_addr_t last_page;
177     /* last ram version we have seen */
178     uint32_t last_version;
179     /* We are in the first round */
180     bool ram_bulk_stage;
181     /* How many times we have dirty too many pages */
182     int dirty_rate_high_cnt;
183     /* these variables are used for bitmap sync */
184     /* last time we did a full bitmap_sync */
185     int64_t time_last_bitmap_sync;
186     /* bytes transferred at the start of the period */
187     uint64_t bytes_xfer_prev;
188     /* number of pages dirtied during the period */
189     uint64_t num_dirty_pages_period;
190     /* xbzrle misses since the beginning of the period */
191     uint64_t xbzrle_cache_miss_prev;
192     /* number of iterations at the beginning of period */
193     uint64_t iterations_prev;
194     /* Iterations since start */
195     uint64_t iterations;
196     /* number of dirty bits in the bitmap */
197     uint64_t migration_dirty_pages;
198     /* protects modification of the bitmap */
199     QemuMutex bitmap_mutex;
200     /* The RAMBlock used in the last src_page_requests */
201     RAMBlock *last_req_rb;
202     /* Queue of outstanding page requests from the destination */
203     QemuMutex src_page_req_mutex;
204     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
205 };
206 typedef struct RAMState RAMState;
207 
208 static RAMState *ram_state;
209 
210 uint64_t ram_bytes_remaining(void)
211 {
212     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
213 }
214 
215 MigrationStats ram_counters;
216 
217 /* used by the search for pages to send */
218 struct PageSearchStatus {
219     /* Current block being searched */
220     RAMBlock    *block;
221     /* Current page to search from */
222     unsigned long page;
223     /* Set once we wrap around */
224     bool         complete_round;
225 };
226 typedef struct PageSearchStatus PageSearchStatus;
227 
228 struct CompressParam {
229     bool done;
230     bool quit;
231     QEMUFile *file;
232     QemuMutex mutex;
233     QemuCond cond;
234     RAMBlock *block;
235     ram_addr_t offset;
236 };
237 typedef struct CompressParam CompressParam;
238 
239 struct DecompressParam {
240     bool done;
241     bool quit;
242     QemuMutex mutex;
243     QemuCond cond;
244     void *des;
245     uint8_t *compbuf;
246     int len;
247 };
248 typedef struct DecompressParam DecompressParam;
249 
250 static CompressParam *comp_param;
251 static QemuThread *compress_threads;
252 /* comp_done_cond is used to wake up the migration thread when
253  * one of the compression threads has finished the compression.
254  * comp_done_lock is the mutex that pairs with comp_done_cond.
255  */
256 static QemuMutex comp_done_lock;
257 static QemuCond comp_done_cond;
258 /* The empty QEMUFileOps is used by the file member of CompressParam */
259 static const QEMUFileOps empty_ops = { };
260 
261 static DecompressParam *decomp_param;
262 static QemuThread *decompress_threads;
263 static QemuMutex decomp_done_lock;
264 static QemuCond decomp_done_cond;
265 
266 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
267                                 ram_addr_t offset);
268 
269 static void *do_data_compress(void *opaque)
270 {
271     CompressParam *param = opaque;
272     RAMBlock *block;
273     ram_addr_t offset;
274 
275     qemu_mutex_lock(&param->mutex);
276     while (!param->quit) {
277         if (param->block) {
278             block = param->block;
279             offset = param->offset;
280             param->block = NULL;
281             qemu_mutex_unlock(&param->mutex);
282 
283             do_compress_ram_page(param->file, block, offset);
284 
285             qemu_mutex_lock(&comp_done_lock);
286             param->done = true;
287             qemu_cond_signal(&comp_done_cond);
288             qemu_mutex_unlock(&comp_done_lock);
289 
290             qemu_mutex_lock(&param->mutex);
291         } else {
292             qemu_cond_wait(&param->cond, &param->mutex);
293         }
294     }
295     qemu_mutex_unlock(&param->mutex);
296 
297     return NULL;
298 }
299 
300 static inline void terminate_compression_threads(void)
301 {
302     int idx, thread_count;
303 
304     thread_count = migrate_compress_threads();
305 
306     for (idx = 0; idx < thread_count; idx++) {
307         qemu_mutex_lock(&comp_param[idx].mutex);
308         comp_param[idx].quit = true;
309         qemu_cond_signal(&comp_param[idx].cond);
310         qemu_mutex_unlock(&comp_param[idx].mutex);
311     }
312 }
313 
314 void migrate_compress_threads_join(void)
315 {
316     int i, thread_count;
317 
318     if (!migrate_use_compression()) {
319         return;
320     }
321     terminate_compression_threads();
322     thread_count = migrate_compress_threads();
323     for (i = 0; i < thread_count; i++) {
324         qemu_thread_join(compress_threads + i);
325         qemu_fclose(comp_param[i].file);
326         qemu_mutex_destroy(&comp_param[i].mutex);
327         qemu_cond_destroy(&comp_param[i].cond);
328     }
329     qemu_mutex_destroy(&comp_done_lock);
330     qemu_cond_destroy(&comp_done_cond);
331     g_free(compress_threads);
332     g_free(comp_param);
333     compress_threads = NULL;
334     comp_param = NULL;
335 }
336 
337 void migrate_compress_threads_create(void)
338 {
339     int i, thread_count;
340 
341     if (!migrate_use_compression()) {
342         return;
343     }
344     thread_count = migrate_compress_threads();
345     compress_threads = g_new0(QemuThread, thread_count);
346     comp_param = g_new0(CompressParam, thread_count);
347     qemu_cond_init(&comp_done_cond);
348     qemu_mutex_init(&comp_done_lock);
349     for (i = 0; i < thread_count; i++) {
350         /* comp_param[i].file is just used as a dummy buffer to save data,
351          * set its ops to empty.
352          */
353         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
354         comp_param[i].done = true;
355         comp_param[i].quit = false;
356         qemu_mutex_init(&comp_param[i].mutex);
357         qemu_cond_init(&comp_param[i].cond);
358         qemu_thread_create(compress_threads + i, "compress",
359                            do_data_compress, comp_param + i,
360                            QEMU_THREAD_JOINABLE);
361     }
362 }
363 
364 /**
365  * save_page_header: write page header to wire
366  *
367  * When the block changes, it also writes the block identification (idstr)
368  *
369  * Returns the number of bytes written
370  *
 * @rs: current RAM state
371  * @f: QEMUFile where to send the data
372  * @block: block that contains the page we want to send
373  * @offset: offset inside the block for the page;
374  *          the lower bits contain flags
375  */
376 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
377                                ram_addr_t offset)
378 {
379     size_t size, len;
380 
381     if (block == rs->last_sent_block) {
382         offset |= RAM_SAVE_FLAG_CONTINUE;
383     }
384     qemu_put_be64(f, offset);
385     size = 8;
386 
387     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
388         len = strlen(block->idstr);
389         qemu_put_byte(f, len);
390         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
391         size += 1 + len;
392         rs->last_sent_block = block;
393     }
394     return size;
395 }
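/*
 * Illustrative wire layout produced by save_page_header() above (a sketch,
 * not a normative description of the stream format):
 *
 *     8 bytes   be64: page offset within the block, OR-ed with RAM_SAVE_FLAG_*
 *     1 byte    length of the block idstr   \  only present when
 *     N bytes   the block idstr itself      /  RAM_SAVE_FLAG_CONTINUE is clear
 */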
396 
397 /**
398  * mig_throttle_guest_down: throttle down the guest
399  *
400  * Reduce the amount of guest CPU execution to hopefully slow down memory
401  * writes. If guest dirty memory rate is reduced below the rate at
402  * which we can transfer pages to the destination then we should be
403  * able to complete migration. Some workloads dirty memory way too
404  * fast and will not effectively converge, even with auto-converge.
405  */
406 static void mig_throttle_guest_down(void)
407 {
408     MigrationState *s = migrate_get_current();
409     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
410     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
411 
412     /* We have not started throttling yet. Let's start it. */
413     if (!cpu_throttle_active()) {
414         cpu_throttle_set(pct_initial);
415     } else {
416         /* Throttling already on, just increase the rate */
417         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
418     }
419 }
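/*
 * Worked example (illustrative): with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, successive calls throttle the vCPUs at
 * 20%, then 30%, 40%, ... until migration converges; clamping the
 * percentage to a sane range is left to cpu_throttle_set().
 */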
420 
421 /**
422  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
423  *
424  * @rs: current RAM state
425  * @current_addr: address for the zero page
426  *
427  * Update the xbzrle cache to reflect a page that's been sent as all 0.
428  * The important thing is that a stale (not-yet-0'd) page be replaced
429  * by the new data.
430  * As a bonus, if the page wasn't in the cache it gets added so that
431  * when a small write is made into the 0'd page it gets XBZRLE sent.
432  */
433 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
434 {
435     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
436         return;
437     }
438 
439     /* We don't care if this fails to allocate a new cache page
440      * as long as it updated an old one */
441     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
442                  ram_counters.dirty_sync_count);
443 }
444 
445 #define ENCODING_FLAG_XBZRLE 0x1
446 
447 /**
448  * save_xbzrle_page: compress and send current page
449  *
450  * Returns: 1 means that we wrote the page
451  *          0 means that page is identical to the one already sent
452  *          -1 means that xbzrle would be longer than normal
453  *
454  * @rs: current RAM state
455  * @current_data: pointer to the address of the page contents
456  * @current_addr: addr of the page
457  * @block: block that contains the page we want to send
458  * @offset: offset inside the block for the page
459  * @last_stage: if we are at the completion stage
460  */
461 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
462                             ram_addr_t current_addr, RAMBlock *block,
463                             ram_addr_t offset, bool last_stage)
464 {
465     int encoded_len = 0, bytes_xbzrle;
466     uint8_t *prev_cached_page;
467 
468     if (!cache_is_cached(XBZRLE.cache, current_addr,
469                          ram_counters.dirty_sync_count)) {
470         xbzrle_counters.cache_miss++;
471         if (!last_stage) {
472             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
473                              ram_counters.dirty_sync_count) == -1) {
474                 return -1;
475             } else {
476                 /* update *current_data when the page has been
477                    inserted into cache */
478                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
479             }
480         }
481         return -1;
482     }
483 
484     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
485 
486     /* save current buffer into memory */
487     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
488 
489     /* XBZRLE encoding (if there is no overflow) */
490     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
491                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
492                                        TARGET_PAGE_SIZE);
493     if (encoded_len == 0) {
494         trace_save_xbzrle_page_skipping();
495         return 0;
496     } else if (encoded_len == -1) {
497         trace_save_xbzrle_page_overflow();
498         xbzrle_counters.overflow++;
499         /* update data in the cache */
500         if (!last_stage) {
501             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
502             *current_data = prev_cached_page;
503         }
504         return -1;
505     }
506 
507     /* Update the cached copy so that it matches the data we just sent */
508     if (!last_stage) {
509         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
510     }
511 
512     /* Send XBZRLE based compressed page */
513     bytes_xbzrle = save_page_header(rs, rs->f, block,
514                                     offset | RAM_SAVE_FLAG_XBZRLE);
515     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
516     qemu_put_be16(rs->f, encoded_len);
517     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
518     bytes_xbzrle += encoded_len + 1 + 2;
519     xbzrle_counters.pages++;
520     xbzrle_counters.bytes += bytes_xbzrle;
521     ram_counters.transferred += bytes_xbzrle;
522 
523     return 1;
524 }
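/*
 * Byte accounting for the XBZRLE path above (illustrative sketch):
 *
 *     bytes_xbzrle = save_page_header(...)    header, at least 8 bytes
 *                  + 1                         ENCODING_FLAG_XBZRLE byte
 *                  + 2                         be16 encoded_len
 *                  + encoded_len;              the encoded delta itself
 *
 * which is the amount added to xbzrle_counters.bytes and
 * ram_counters.transferred.
 */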
525 
526 /**
527  * migration_bitmap_find_dirty: find the next dirty page from start
528  *
529  * Called with rcu_read_lock() to protect migration_bitmap
530  *
531  * Returns the page offset (in target pages) within the RAMBlock of the next dirty page
532  *
533  * @rs: current RAM state
534  * @rb: RAMBlock where to search for dirty pages
535  * @start: page where we start the search
536  */
537 static inline
538 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
539                                           unsigned long start)
540 {
541     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
542     unsigned long *bitmap = rb->bmap;
543     unsigned long next;
544 
545     if (rs->ram_bulk_stage && start > 0) {
546         next = start + 1;
547     } else {
548         next = find_next_bit(bitmap, size, start);
549     }
550 
551     return next;
552 }
553 
554 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
555                                                 RAMBlock *rb,
556                                                 unsigned long page)
557 {
558     bool ret;
559 
560     ret = test_and_clear_bit(page, rb->bmap);
561 
562     if (ret) {
563         rs->migration_dirty_pages--;
564     }
565     return ret;
566 }
567 
568 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
569                                         ram_addr_t start, ram_addr_t length)
570 {
571     rs->migration_dirty_pages +=
572         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
573                                               &rs->num_dirty_pages_period);
574 }
575 
576 /**
577  * ram_pagesize_summary: calculate all the pagesizes of a VM
578  *
579  * Returns a summary bitmap of the page sizes of all RAMBlocks
580  *
581  * For VMs with just normal pages this is equivalent to the host page
582  * size. If it's got some huge pages then it's the OR of all the
583  * different page sizes.
584  */
585 uint64_t ram_pagesize_summary(void)
586 {
587     RAMBlock *block;
588     uint64_t summary = 0;
589 
590     RAMBLOCK_FOREACH(block) {
591         summary |= block->page_size;
592     }
593 
594     return summary;
595 }
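/*
 * Example (illustrative): a guest with ordinary 4 KiB RAM plus one RAMBlock
 * backed by 2 MiB hugepages yields 0x1000 | 0x200000 == 0x201000.
 */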
596 
597 static void migration_bitmap_sync(RAMState *rs)
598 {
599     RAMBlock *block;
600     int64_t end_time;
601     uint64_t bytes_xfer_now;
602 
603     ram_counters.dirty_sync_count++;
604 
605     if (!rs->time_last_bitmap_sync) {
606         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
607     }
608 
609     trace_migration_bitmap_sync_start();
610     memory_global_dirty_log_sync();
611 
612     qemu_mutex_lock(&rs->bitmap_mutex);
613     rcu_read_lock();
614     RAMBLOCK_FOREACH(block) {
615         migration_bitmap_sync_range(rs, block, 0, block->used_length);
616     }
617     rcu_read_unlock();
618     qemu_mutex_unlock(&rs->bitmap_mutex);
619 
620     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
621 
622     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
623 
624     /* more than 1 second = 1000 milliseconds */
625     if (end_time > rs->time_last_bitmap_sync + 1000) {
626         /* calculate period counters */
627         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
628             / (end_time - rs->time_last_bitmap_sync);
629         bytes_xfer_now = ram_counters.transferred;
630 
631         if (migrate_auto_converge()) {
632             /* The following detection logic can be refined later. For now:
633                check whether the bytes dirtied in this period exceed 50% of
634                the bytes that were transferred since the last time we were
635                in this routine. If that happens twice in a row, start or
636                increase throttling. */
637 
638             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
639                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
640                 (++rs->dirty_rate_high_cnt >= 2)) {
641                     trace_migration_throttle();
642                     rs->dirty_rate_high_cnt = 0;
643                     mig_throttle_guest_down();
644             }
645         }
646 
647         if (migrate_use_xbzrle()) {
648             if (rs->iterations_prev != rs->iterations) {
649                 xbzrle_counters.cache_miss_rate =
650                    (double)(xbzrle_counters.cache_miss -
651                             rs->xbzrle_cache_miss_prev) /
652                    (rs->iterations - rs->iterations_prev);
653             }
654             rs->iterations_prev = rs->iterations;
655             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
656         }
657 
658         /* reset period counters */
659         rs->time_last_bitmap_sync = end_time;
660         rs->num_dirty_pages_period = 0;
661         rs->bytes_xfer_prev = bytes_xfer_now;
662     }
663     if (migrate_use_events()) {
664         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
665     }
666 }
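/*
 * Worked example for the auto-converge check above (illustrative): if a sync
 * period dirtied 300 MiB of guest RAM while roughly 400 MiB were transferred,
 * then 300 MiB > 400 MiB / 2, so dirty_rate_high_cnt is bumped; two such
 * consecutive periods trigger mig_throttle_guest_down().
 */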
667 
668 /**
669  * save_zero_page: send the zero page to the stream
670  *
671  * Returns 1 if the page was a zero page (and was sent), -1 otherwise.
672  *
673  * @rs: current RAM state
674  * @block: block that contains the page we want to send
675  * @offset: offset inside the block for the page
676  * @p: pointer to the page
677  */
678 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
679                           uint8_t *p)
680 {
681     int pages = -1;
682 
683     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
684         ram_counters.duplicate++;
685         ram_counters.transferred +=
686             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
687         qemu_put_byte(rs->f, 0);
688         ram_counters.transferred += 1;
689         pages = 1;
690     }
691 
692     return pages;
693 }
694 
695 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
696 {
697     if (!migrate_release_ram() || !migration_in_postcopy()) {
698         return;
699     }
700 
701     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
702 }
703 
704 /**
705  * ram_save_page: send the given page to the stream
706  *
707  * Returns the number of pages written.
708  *          < 0 - error
709  *          >=0 - Number of pages written - this might legally be 0
710  *                if xbzrle noticed the page was the same.
711  *
712  * @rs: current RAM state
713  * @block: block that contains the page we want to send
714  * @offset: offset inside the block for the page
715  * @last_stage: if we are at the completion stage
716  */
717 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
718 {
719     int pages = -1;
720     uint64_t bytes_xmit;
721     ram_addr_t current_addr;
722     uint8_t *p;
723     int ret;
724     bool send_async = true;
725     RAMBlock *block = pss->block;
726     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
727 
728     p = block->host + offset;
729     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
730 
731     /* When in doubt, send the page as a normal page */
732     bytes_xmit = 0;
733     ret = ram_control_save_page(rs->f, block->offset,
734                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
735     if (bytes_xmit) {
736         ram_counters.transferred += bytes_xmit;
737         pages = 1;
738     }
739 
740     XBZRLE_cache_lock();
741 
742     current_addr = block->offset + offset;
743 
744     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
745         if (ret != RAM_SAVE_CONTROL_DELAYED) {
746             if (bytes_xmit > 0) {
747                 ram_counters.normal++;
748             } else if (bytes_xmit == 0) {
749                 ram_counters.duplicate++;
750             }
751         }
752     } else {
753         pages = save_zero_page(rs, block, offset, p);
754         if (pages > 0) {
755             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
756              * page would be stale
757              */
758             xbzrle_cache_zero_page(rs, current_addr);
759             ram_release_pages(block->idstr, offset, pages);
760         } else if (!rs->ram_bulk_stage &&
761                    !migration_in_postcopy() && migrate_use_xbzrle()) {
762             pages = save_xbzrle_page(rs, &p, current_addr, block,
763                                      offset, last_stage);
764             if (!last_stage) {
765                 /* Can't send this cached data async, since the cache page
766                  * might get updated before it gets to the wire
767                  */
768                 send_async = false;
769             }
770         }
771     }
772 
773     /* XBZRLE overflow or normal page */
774     if (pages == -1) {
775         ram_counters.transferred +=
776             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
777         if (send_async) {
778             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
779                                   migrate_release_ram() &&
780                                   migration_in_postcopy());
781         } else {
782             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
783         }
784         ram_counters.transferred += TARGET_PAGE_SIZE;
785         pages = 1;
786         ram_counters.normal++;
787     }
788 
789     XBZRLE_cache_unlock();
790 
791     return pages;
792 }
793 
794 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
795                                 ram_addr_t offset)
796 {
797     RAMState *rs = ram_state;
798     int bytes_sent, blen;
799     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
800 
801     bytes_sent = save_page_header(rs, f, block, offset |
802                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
803     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
804                                      migrate_compress_level());
805     if (blen < 0) {
806         bytes_sent = 0;
807         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
808         error_report("compressed data failed!");
809     } else {
810         bytes_sent += blen;
811         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
812     }
813 
814     return bytes_sent;
815 }
816 
817 static void flush_compressed_data(RAMState *rs)
818 {
819     int idx, len, thread_count;
820 
821     if (!migrate_use_compression()) {
822         return;
823     }
824     thread_count = migrate_compress_threads();
825 
826     qemu_mutex_lock(&comp_done_lock);
827     for (idx = 0; idx < thread_count; idx++) {
828         while (!comp_param[idx].done) {
829             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
830         }
831     }
832     qemu_mutex_unlock(&comp_done_lock);
833 
834     for (idx = 0; idx < thread_count; idx++) {
835         qemu_mutex_lock(&comp_param[idx].mutex);
836         if (!comp_param[idx].quit) {
837             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
838             ram_counters.transferred += len;
839         }
840         qemu_mutex_unlock(&comp_param[idx].mutex);
841     }
842 }
843 
844 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
845                                        ram_addr_t offset)
846 {
847     param->block = block;
848     param->offset = offset;
849 }
850 
851 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
852                                            ram_addr_t offset)
853 {
854     int idx, thread_count, bytes_xmit = -1, pages = -1;
855 
856     thread_count = migrate_compress_threads();
857     qemu_mutex_lock(&comp_done_lock);
858     while (true) {
859         for (idx = 0; idx < thread_count; idx++) {
860             if (comp_param[idx].done) {
861                 comp_param[idx].done = false;
862                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
863                 qemu_mutex_lock(&comp_param[idx].mutex);
864                 set_compress_params(&comp_param[idx], block, offset);
865                 qemu_cond_signal(&comp_param[idx].cond);
866                 qemu_mutex_unlock(&comp_param[idx].mutex);
867                 pages = 1;
868                 ram_counters.normal++;
869                 ram_counters.transferred += bytes_xmit;
870                 break;
871             }
872         }
873         if (pages > 0) {
874             break;
875         } else {
876             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
877         }
878     }
879     qemu_mutex_unlock(&comp_done_lock);
880 
881     return pages;
882 }
883 
884 /**
885  * ram_save_compressed_page: compress the given page and send it to the stream
886  *
887  * Returns the number of pages written.
888  *
889  * @rs: current RAM state
890  * @block: block that contains the page we want to send
891  * @offset: offset inside the block for the page
892  * @last_stage: if we are at the completion stage
893  */
894 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
895                                     bool last_stage)
896 {
897     int pages = -1;
898     uint64_t bytes_xmit = 0;
899     uint8_t *p;
900     int ret, blen;
901     RAMBlock *block = pss->block;
902     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
903 
904     p = block->host + offset;
905 
906     ret = ram_control_save_page(rs->f, block->offset,
907                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
908     if (bytes_xmit) {
909         ram_counters.transferred += bytes_xmit;
910         pages = 1;
911     }
912     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
913         if (ret != RAM_SAVE_CONTROL_DELAYED) {
914             if (bytes_xmit > 0) {
915                 ram_counters.normal++;
916             } else if (bytes_xmit == 0) {
917                 ram_counters.duplicate++;
918             }
919         }
920     } else {
921         /* When starting a new block, the first page of the block must be
922          * sent before any other page of that block, and all pages of the
923          * previous block must already have been sent.  Keeping this order
924          * matters because the 'cont' flag is used to avoid resending the
925          * block name.
926          */
927         if (block != rs->last_sent_block) {
928             flush_compressed_data(rs);
929             pages = save_zero_page(rs, block, offset, p);
930             if (pages == -1) {
931                 /* Make sure the first page is sent out before other pages */
932                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
933                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
934                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
935                                                  migrate_compress_level());
936                 if (blen > 0) {
937                     ram_counters.transferred += bytes_xmit + blen;
938                     ram_counters.normal++;
939                     pages = 1;
940                 } else {
941                     qemu_file_set_error(rs->f, blen);
942                     error_report("compressed data failed!");
943                 }
944             }
945             if (pages > 0) {
946                 ram_release_pages(block->idstr, offset, pages);
947             }
948         } else {
949             pages = save_zero_page(rs, block, offset, p);
950             if (pages == -1) {
951                 pages = compress_page_with_multi_thread(rs, block, offset);
952             } else {
953                 ram_release_pages(block->idstr, offset, pages);
954             }
955         }
956     }
957 
958     return pages;
959 }
960 
961 /**
962  * find_dirty_block: find the next dirty page and update any state
963  * associated with the search process.
964  *
965  * Returns true if a page is found
966  *
967  * @rs: current RAM state
968  * @pss: data about the state of the current dirty page scan
969  * @again: set to false if the search has scanned the whole of RAM
970  */
971 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
972 {
973     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
974     if (pss->complete_round && pss->block == rs->last_seen_block &&
975         pss->page >= rs->last_page) {
976         /*
977          * We've been once around the RAM and haven't found anything.
978          * Give up.
979          */
980         *again = false;
981         return false;
982     }
983     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
984         /* Didn't find anything in this RAM Block */
985         pss->page = 0;
986         pss->block = QLIST_NEXT_RCU(pss->block, next);
987         if (!pss->block) {
988             /* Hit the end of the list */
989             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
990             /* Flag that we've looped */
991             pss->complete_round = true;
992             rs->ram_bulk_stage = false;
993             if (migrate_use_xbzrle()) {
994                 /* If xbzrle is on, stop using the data compression at this
995                  * point. In theory, xbzrle can do better than compression.
996                  */
997                 flush_compressed_data(rs);
998             }
999         }
1000         /* Didn't find anything this time, but try again on the new block */
1001         *again = true;
1002         return false;
1003     } else {
1004         /* Can go around again, but... */
1005         *again = true;
1006         /* We've found something so probably don't need to */
1007         return true;
1008     }
1009 }
1010 
1011 /**
1012  * unqueue_page: gets a page off the queue
1013  *
1014  * Helper for 'get_queued_page' - gets a page off the queue
1015  *
1016  * Returns the block of the page (or NULL if none available)
1017  *
1018  * @rs: current RAM state
1019  * @offset: used to return the offset within the RAMBlock
1020  */
1021 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1022 {
1023     RAMBlock *block = NULL;
1024 
1025     qemu_mutex_lock(&rs->src_page_req_mutex);
1026     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1027         struct RAMSrcPageRequest *entry =
1028                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1029         block = entry->rb;
1030         *offset = entry->offset;
1031 
1032         if (entry->len > TARGET_PAGE_SIZE) {
1033             entry->len -= TARGET_PAGE_SIZE;
1034             entry->offset += TARGET_PAGE_SIZE;
1035         } else {
1036             memory_region_unref(block->mr);
1037             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1038             g_free(entry);
1039         }
1040     }
1041     qemu_mutex_unlock(&rs->src_page_req_mutex);
1042 
1043     return block;
1044 }
1045 
1046 /**
1047  * get_queued_page: unqueue a page from the postcopy requests
1048  *
1049  * Skips pages that are already sent (!dirty)
1050  *
1051  * Returns true if a queued page is found
1052  *
1053  * @rs: current RAM state
1054  * @pss: data about the state of the current dirty page scan
1055  */
1056 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1057 {
1058     RAMBlock  *block;
1059     ram_addr_t offset;
1060     bool dirty;
1061 
1062     do {
1063         block = unqueue_page(rs, &offset);
1064         /*
1065          * We're sending this page, and since it's postcopy nothing else
1066          * will dirty it, and we must make sure it doesn't get sent again
1067          * even if this queue request was received after the background
1068          * search already sent it.
1069          */
1070         if (block) {
1071             unsigned long page;
1072 
1073             page = offset >> TARGET_PAGE_BITS;
1074             dirty = test_bit(page, block->bmap);
1075             if (!dirty) {
1076                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1077                        page, test_bit(page, block->unsentmap));
1078             } else {
1079                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1080             }
1081         }
1082 
1083     } while (block && !dirty);
1084 
1085     if (block) {
1086         /*
1087          * As soon as we start servicing pages out of order, then we have
1088          * to kill the bulk stage, since the bulk stage assumes
1089          * in (migration_bitmap_find_dirty) that every page is
1090          * dirty, that's no longer true.
1091          */
1092         rs->ram_bulk_stage = false;
1093 
1094         /*
1095          * We want the background search to continue from the queued page
1096          * since the guest is likely to want other pages near to the page
1097          * it just requested.
1098          */
1099         pss->block = block;
1100         pss->page = offset >> TARGET_PAGE_BITS;
1101     }
1102 
1103     return !!block;
1104 }
1105 
1106 /**
1107  * migration_page_queue_free: drop any remaining pages in the ram
1108  * request queue
1109  *
1110  * It should be empty at the end anyway, but in error cases there may
1111  * be some left.  If any pages are left over, we drop them.
1112  *
1113  */
1114 static void migration_page_queue_free(RAMState *rs)
1115 {
1116     struct RAMSrcPageRequest *mspr, *next_mspr;
1117     /* This queue generally should be empty - but in the case of a failed
1118      * migration might have some droppings in.
1119      */
1120     rcu_read_lock();
1121     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1122         memory_region_unref(mspr->rb->mr);
1123         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1124         g_free(mspr);
1125     }
1126     rcu_read_unlock();
1127 }
1128 
1129 /**
1130  * ram_save_queue_pages: queue the page for transmission
1131  *
1132  * A request from postcopy destination for example.
1133  *
1134  * Returns zero on success or negative on error
1135  *
1136  * @rbname: Name of the RAMBlock of the request. NULL means the
1137  *          same as the last one.
1138  * @start: starting address from the start of the RAMBlock
1139  * @len: length (in bytes) to send
1140  */
1141 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1142 {
1143     RAMBlock *ramblock;
1144     RAMState *rs = ram_state;
1145 
1146     ram_counters.postcopy_requests++;
1147     rcu_read_lock();
1148     if (!rbname) {
1149         /* Reuse last RAMBlock */
1150         ramblock = rs->last_req_rb;
1151 
1152         if (!ramblock) {
1153             /*
1154              * Shouldn't happen, we can't reuse the last RAMBlock if
1155              * it's the 1st request.
1156              */
1157             error_report("ram_save_queue_pages no previous block");
1158             goto err;
1159         }
1160     } else {
1161         ramblock = qemu_ram_block_by_name(rbname);
1162 
1163         if (!ramblock) {
1164             /* We shouldn't be asked for a non-existent RAMBlock */
1165             error_report("ram_save_queue_pages no block '%s'", rbname);
1166             goto err;
1167         }
1168         rs->last_req_rb = ramblock;
1169     }
1170     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1171     if (start + len > ramblock->used_length) {
1172         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1173                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1174                      __func__, start, len, ramblock->used_length);
1175         goto err;
1176     }
1177 
1178     struct RAMSrcPageRequest *new_entry =
1179         g_malloc0(sizeof(struct RAMSrcPageRequest));
1180     new_entry->rb = ramblock;
1181     new_entry->offset = start;
1182     new_entry->len = len;
1183 
1184     memory_region_ref(ramblock->mr);
1185     qemu_mutex_lock(&rs->src_page_req_mutex);
1186     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1187     qemu_mutex_unlock(&rs->src_page_req_mutex);
1188     rcu_read_unlock();
1189 
1190     return 0;
1191 
1192 err:
1193     rcu_read_unlock();
1194     return -1;
1195 }
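/*
 * Usage sketch (illustrative; the block name, offset and length below are
 * made up): when the destination faults on a page it has not yet received,
 * the source's return path ends up queueing it with something like
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE);
 *
 * after which get_queued_page() will service the request ahead of the
 * background dirty-page scan.
 */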
1196 
1197 /**
1198  * ram_save_target_page: save one target page
1199  *
1200  * Returns the number of pages written
1201  *
1202  * @rs: current RAM state
1204  * @pss: data about the page we want to send
1205  * @last_stage: if we are at the completion stage
1206  */
1207 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1208                                 bool last_stage)
1209 {
1210     int res = 0;
1211 
1212     /* Check if the page is dirty and, if it is, send it */
1213     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1214         /*
1215          * If xbzrle is on, stop using the data compression after first
1216          * round of migration even if compression is enabled. In theory,
1217          * xbzrle can do better than compression.
1218          */
1219         if (migrate_use_compression() &&
1220             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1221             res = ram_save_compressed_page(rs, pss, last_stage);
1222         } else {
1223             res = ram_save_page(rs, pss, last_stage);
1224         }
1225 
1226         if (res < 0) {
1227             return res;
1228         }
1229         if (pss->block->unsentmap) {
1230             clear_bit(pss->page, pss->block->unsentmap);
1231         }
1232     }
1233 
1234     return res;
1235 }
1236 
1237 /**
1238  * ram_save_host_page: save a whole host page
1239  *
1240  * Starting at pss->page, send pages up to the end of the current host
1241  * page. It's valid for the initial offset to point into the middle of
1242  * a host page in which case the remainder of the hostpage is sent.
1243  * Only dirty target pages are sent. Note that the host page size may
1244  * be a huge page for this block.
1245  * The saving stops at the boundary of the used_length of the block
1246  * if the RAMBlock isn't a multiple of the host page size.
1247  *
1248  * Returns the number of pages written or negative on error
1249  *
1250  * @rs: current RAM state
1252  * @pss: data about the page we want to send
1253  * @last_stage: if we are at the completion stage
1254  */
1255 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1256                               bool last_stage)
1257 {
1258     int tmppages, pages = 0;
1259     size_t pagesize_bits =
1260         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1261 
1262     do {
1263         tmppages = ram_save_target_page(rs, pss, last_stage);
1264         if (tmppages < 0) {
1265             return tmppages;
1266         }
1267 
1268         pages += tmppages;
1269         pss->page++;
1270     } while ((pss->page & (pagesize_bits - 1)) &&
1271              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1272 
1273     /* The offset we leave with is the last one we looked at */
1274     pss->page--;
1275     return pages;
1276 }
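/*
 * Example (illustrative): for a RAMBlock backed by 2 MiB hugepages with a
 * 4 KiB target page size, pagesize_bits is 512, so a single call can send up
 * to 512 dirty target pages before it reaches the host-page boundary and
 * stops.
 */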
1277 
1278 /**
1279  * ram_find_and_save_block: finds a dirty page and sends it to f
1280  *
1281  * Called within an RCU critical section.
1282  *
1283  * Returns the number of pages written where zero means no dirty pages
1284  *
1285  * @rs: current RAM state
1286  * @last_stage: if we are at the completion stage
1287  *
1288  * On systems where host-page-size > target-page-size it will send all the
1289  * pages in a host page that are dirty.
1290  */
1291 
1292 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1293 {
1294     PageSearchStatus pss;
1295     int pages = 0;
1296     bool again, found;
1297 
1298     /* No dirty page as there is zero RAM */
1299     if (!ram_bytes_total()) {
1300         return pages;
1301     }
1302 
1303     pss.block = rs->last_seen_block;
1304     pss.page = rs->last_page;
1305     pss.complete_round = false;
1306 
1307     if (!pss.block) {
1308         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1309     }
1310 
1311     do {
1312         again = true;
1313         found = get_queued_page(rs, &pss);
1314 
1315         if (!found) {
1316             /* priority queue empty, so just search for something dirty */
1317             found = find_dirty_block(rs, &pss, &again);
1318         }
1319 
1320         if (found) {
1321             pages = ram_save_host_page(rs, &pss, last_stage);
1322         }
1323     } while (!pages && again);
1324 
1325     rs->last_seen_block = pss.block;
1326     rs->last_page = pss.page;
1327 
1328     return pages;
1329 }
1330 
1331 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1332 {
1333     uint64_t pages = size / TARGET_PAGE_SIZE;
1334 
1335     if (zero) {
1336         ram_counters.duplicate += pages;
1337     } else {
1338         ram_counters.normal += pages;
1339         ram_counters.transferred += size;
1340         qemu_update_position(f, size);
1341     }
1342 }
1343 
1344 uint64_t ram_bytes_total(void)
1345 {
1346     RAMBlock *block;
1347     uint64_t total = 0;
1348 
1349     rcu_read_lock();
1350     RAMBLOCK_FOREACH(block) {
1351         total += block->used_length;
1352     }
1353     rcu_read_unlock();
1354     return total;
1355 }
1356 
1357 void free_xbzrle_decoded_buf(void)
1358 {
1359     g_free(xbzrle_decoded_buf);
1360     xbzrle_decoded_buf = NULL;
1361 }
1362 
1363 static void ram_migration_cleanup(void *opaque)
1364 {
1365     RAMState **rsp = opaque;
1366     RAMBlock *block;
1367 
1368     /* The caller must hold the iothread lock or be in a BH, so there is
1369      * no writing race against this migration bitmap
1370      */
1371     memory_global_dirty_log_stop();
1372 
1373     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1374         g_free(block->bmap);
1375         block->bmap = NULL;
1376         g_free(block->unsentmap);
1377         block->unsentmap = NULL;
1378     }
1379 
1380     XBZRLE_cache_lock();
1381     if (XBZRLE.cache) {
1382         cache_fini(XBZRLE.cache);
1383         g_free(XBZRLE.encoded_buf);
1384         g_free(XBZRLE.current_buf);
1385         g_free(XBZRLE.zero_target_page);
1386         XBZRLE.cache = NULL;
1387         XBZRLE.encoded_buf = NULL;
1388         XBZRLE.current_buf = NULL;
1389         XBZRLE.zero_target_page = NULL;
1390     }
1391     XBZRLE_cache_unlock();
1392     migration_page_queue_free(*rsp);
1393     g_free(*rsp);
1394     *rsp = NULL;
1395 }
1396 
1397 static void ram_state_reset(RAMState *rs)
1398 {
1399     rs->last_seen_block = NULL;
1400     rs->last_sent_block = NULL;
1401     rs->last_page = 0;
1402     rs->last_version = ram_list.version;
1403     rs->ram_bulk_stage = true;
1404 }
1405 
1406 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1407 
1408 /*
1409  * 'expected' is the value you expect the bitmap mostly to be full
1410  * of; it won't bother printing lines that are all this value.
1411  * 'todump' is the bitmap to dump.
1412  */
1413 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1414                            unsigned long pages)
1415 {
1416     int64_t cur;
1417     int64_t linelen = 128;
1418     char linebuf[129];
1419 
1420     for (cur = 0; cur < pages; cur += linelen) {
1421         int64_t curb;
1422         bool found = false;
1423         /*
1424          * Last line; catch the case where the line length
1425          * is longer than remaining ram
1426          */
1427         if (cur + linelen > pages) {
1428             linelen = pages - cur;
1429         }
1430         for (curb = 0; curb < linelen; curb++) {
1431             bool thisbit = test_bit(cur + curb, todump);
1432             linebuf[curb] = thisbit ? '1' : '.';
1433             found = found || (thisbit != expected);
1434         }
1435         if (found) {
1436             linebuf[curb] = '\0';
1437             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1438         }
1439     }
1440 }
1441 
1442 /* ***** functions for postcopy ***** */
1443 
1444 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1445 {
1446     struct RAMBlock *block;
1447 
1448     RAMBLOCK_FOREACH(block) {
1449         unsigned long *bitmap = block->bmap;
1450         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1451         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1452 
1453         while (run_start < range) {
1454             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1455             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1456                               (run_end - run_start) << TARGET_PAGE_BITS);
1457             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1458         }
1459     }
1460 }
1461 
1462 /**
1463  * postcopy_send_discard_bm_ram: discard a RAMBlock
1464  *
1465  * Returns zero on success
1466  *
1467  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1468  * Note: At this point the 'unsentmap' is the processed bitmap combined
1469  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1470  *
1471  * @ms: current migration state
1472  * @pds: state for postcopy
1473  * @block: RAMBlock to discard
1475  */
1476 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1477                                         PostcopyDiscardState *pds,
1478                                         RAMBlock *block)
1479 {
1480     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1481     unsigned long current;
1482     unsigned long *unsentmap = block->unsentmap;
1483 
1484     for (current = 0; current < end; ) {
1485         unsigned long one = find_next_bit(unsentmap, end, current);
1486 
1487         if (one <= end) {
1488             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1489             unsigned long discard_length;
1490 
1491             if (zero >= end) {
1492                 discard_length = end - one;
1493             } else {
1494                 discard_length = zero - one;
1495             }
1496             if (discard_length) {
1497                 postcopy_discard_send_range(ms, pds, one, discard_length);
1498             }
1499             current = one + discard_length;
1500         } else {
1501             current = one;
1502         }
1503     }
1504 
1505     return 0;
1506 }
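/*
 * Worked example (illustrative): if the unsentmap starts 0 0 1 1 1 0 ...,
 * the first iteration finds one = 2 and zero = 5, so discard_length = 3 and
 * a discard for pages [2, 5) is sent; the scan then resumes at page 5.
 */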
1507 
1508 /**
1509  * postcopy_each_ram_send_discard: discard all RAMBlocks
1510  *
1511  * Returns 0 for success or negative for error
1512  *
1513  * Utility for the outgoing postcopy code.
1514  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1515  *   passing it bitmap indexes and name.
1516  * (qemu_ram_foreach_block ends up passing unscaled lengths
1517  *  which would mean postcopy code would have to deal with target page)
1518  *
1519  * @ms: current migration state
1520  */
1521 static int postcopy_each_ram_send_discard(MigrationState *ms)
1522 {
1523     struct RAMBlock *block;
1524     int ret;
1525 
1526     RAMBLOCK_FOREACH(block) {
1527         PostcopyDiscardState *pds =
1528             postcopy_discard_send_init(ms, block->idstr);
1529 
1530         /*
1531          * Postcopy sends chunks of bitmap over the wire, but it
1532          * just needs indexes at this point, avoids it having
1533          * target page specific code.
1534          */
1535         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1536         postcopy_discard_send_finish(ms, pds);
1537         if (ret) {
1538             return ret;
1539         }
1540     }
1541 
1542     return 0;
1543 }
1544 
1545 /**
1546  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1547  *
1548  * Helper for postcopy_chunk_hostpages; it's called twice to
1549  * canonicalize the two bitmaps, that are similar, but one is
1550  * inverted.
1551  *
1552  * Postcopy requires that all target pages in a hostpage are dirty or
1553  * clean, not a mix.  This function canonicalizes the bitmaps.
1554  *
1555  * @ms: current migration state
1556  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1557  *               otherwise we need to canonicalize partially dirty host pages
1558  * @block: block that contains the page we want to canonicalize
1559  * @pds: state for postcopy
1560  */
1561 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1562                                           RAMBlock *block,
1563                                           PostcopyDiscardState *pds)
1564 {
1565     RAMState *rs = ram_state;
1566     unsigned long *bitmap = block->bmap;
1567     unsigned long *unsentmap = block->unsentmap;
1568     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1569     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1570     unsigned long run_start;
1571 
1572     if (block->page_size == TARGET_PAGE_SIZE) {
1573         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1574         return;
1575     }
1576 
1577     if (unsent_pass) {
1578         /* Find a sent page */
1579         run_start = find_next_zero_bit(unsentmap, pages, 0);
1580     } else {
1581         /* Find a dirty page */
1582         run_start = find_next_bit(bitmap, pages, 0);
1583     }
1584 
1585     while (run_start < pages) {
1586         bool do_fixup = false;
1587         unsigned long fixup_start_addr;
1588         unsigned long host_offset;
1589 
1590         /*
1591          * If the start of this run of pages is in the middle of a host
1592          * page, then we need to fixup this host page.
1593          */
1594         host_offset = run_start % host_ratio;
1595         if (host_offset) {
1596             do_fixup = true;
1597             run_start -= host_offset;
1598             fixup_start_addr = run_start;
1599             /* For the next pass */
1600             run_start = run_start + host_ratio;
1601         } else {
1602             /* Find the end of this run */
1603             unsigned long run_end;
1604             if (unsent_pass) {
1605                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1606             } else {
1607                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1608             }
1609             /*
1610              * If the end isn't at the start of a host page, then the
1611              * run doesn't finish at the end of a host page
1612              * and we need to discard.
1613              */
1614             host_offset = run_end % host_ratio;
1615             if (host_offset) {
1616                 do_fixup = true;
1617                 fixup_start_addr = run_end - host_offset;
1618                 /*
1619                  * This host page has gone, the next loop iteration starts
1620                  * from after the fixup
1621                  */
1622                 run_start = fixup_start_addr + host_ratio;
1623             } else {
1624                 /*
1625                  * No discards on this iteration, next loop starts from
1626                  * next sent/dirty page
1627                  */
1628                 run_start = run_end + 1;
1629             }
1630         }
1631 
1632         if (do_fixup) {
1633             unsigned long page;
1634 
1635             /* Tell the destination to discard this page */
1636             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1637                 /* For the unsent_pass we:
1638                  *     discard partially sent pages
1639                  * For the !unsent_pass (dirty) we:
1640                  *     discard partially dirty pages that were sent
1641                  *     (any partially sent pages were already discarded
1642                  *     by the previous unsent_pass)
1643                  */
1644                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1645                                             host_ratio);
1646             }
1647 
1648             /* Clean up the bitmap */
1649             for (page = fixup_start_addr;
1650                  page < fixup_start_addr + host_ratio; page++) {
1651                 /* All pages in this host page are now not sent */
1652                 set_bit(page, unsentmap);
1653 
1654                 /*
1655                  * Remark them as dirty, updating the count for any pages
1656                  * that weren't previously dirty.
1657                  */
1658                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1659             }
1660         }
1661 
1662         if (unsent_pass) {
1663             /* Find the next sent page for the next iteration */
1664             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1665         } else {
1666             /* Find the next dirty page for the next iteration */
1667             run_start = find_next_bit(bitmap, pages, run_start);
1668         }
1669     }
1670 }
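
/*
 * Worked example of the fixup arithmetic above (illustrative only,
 * assuming host_ratio == 4, i.e. a 16 KiB host page made of 4 KiB target
 * pages; a 2 MiB x86 huge page would give host_ratio == 512):
 *
 *   run_start = 6  ->  host_offset = 6 % 4 = 2, so the run starts mid
 *                      host page and do_fixup is set
 *   fixup_start_addr = 6 - 2 = 4, and run_start becomes 4 + 4 = 8
 *
 * The fixup then discards target pages 4..7 on the destination, sets them
 * in unsentmap and re-marks them dirty in bmap, so the whole host page is
 * sent again as one unit.
 */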
1671 
1672 /**
1673  * postcopy_chunk_hostpages: discard any partially sent host page
1674  *
1675  * Utility for the outgoing postcopy code.
1676  *
1677  * Discard any partially sent host-page sized chunks and mark any partially
1678  * dirty host-page sized chunks as all dirty.  In this case the host page
1679  * is the host page for the particular RAMBlock, i.e. it might be a huge page.
1680  *
1681  * Returns zero on success
1682  *
1683  * @ms: current migration state
1684  * @block: block we want to work with
1685  */
1686 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1687 {
1688     PostcopyDiscardState *pds =
1689         postcopy_discard_send_init(ms, block->idstr);
1690 
1691     /* First pass: Discard all partially sent host pages */
1692     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1693     /*
1694      * Second pass: Ensure that all partially dirty host pages are made
1695      * fully dirty.
1696      */
1697     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1698 
1699     postcopy_discard_send_finish(ms, pds);
1700     return 0;
1701 }
1702 
1703 /**
1704  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1705  *
1706  * Returns zero on success
1707  *
1708  * Transmit the set of pages to be discarded after precopy to the target;
1709  * these are pages that:
1710  *     a) have been previously transmitted but are now dirty again
1711  *     b) have never been transmitted; this ensures that any pages on the
1712  *        destination that have been mapped by background tasks get
1713  *        discarded (transparent huge pages are the specific concern)
1714  * Hopefully this is pretty sparse.
1715  *
1716  * @ms: current migration state
1717  */
1718 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1719 {
1720     RAMState *rs = ram_state;
1721     RAMBlock *block;
1722     int ret;
1723 
1724     rcu_read_lock();
1725 
1726     /* This should be our last sync, the src is now paused */
1727     migration_bitmap_sync(rs);
1728 
1729     /* Easiest way to make sure we don't resume in the middle of a host-page */
1730     rs->last_seen_block = NULL;
1731     rs->last_sent_block = NULL;
1732     rs->last_page = 0;
1733 
1734     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1735         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1736         unsigned long *bitmap = block->bmap;
1737         unsigned long *unsentmap = block->unsentmap;
1738 
1739         if (!unsentmap) {
1740             /* We don't have a safe way to resize the unsentmap, so
1741              * if the bitmap was resized it will be NULL at this
1742              * point.
1743              */
1744             error_report("migration ram resized during precopy phase");
1745             rcu_read_unlock();
1746             return -EINVAL;
1747         }
1748         /* Deal with TPS != HPS and huge pages */
1749         ret = postcopy_chunk_hostpages(ms, block);
1750         if (ret) {
1751             rcu_read_unlock();
1752             return ret;
1753         }
1754 
1755         /*
1756          * Update the unsentmap to be unsentmap = unsentmap | dirty
1757          */
1758         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1759 #ifdef DEBUG_POSTCOPY
1760         ram_debug_dump_bitmap(unsentmap, true, pages);
1761 #endif
1762     }
1763     trace_ram_postcopy_send_discard_bitmap();
1764 
1765     ret = postcopy_each_ram_send_discard(ms);
1766     rcu_read_unlock();
1767 
1768     return ret;
1769 }
1770 
1771 /**
1772  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1773  *
1774  * Returns zero on success
1775  *
1776  * @rbname: name of the RAMBlock of the request. NULL means the
1777  *          same as the last one.
1778  * @start: starting offset within the RAMBlock, in bytes
1779  * @length: length of the range to discard, in bytes
1780  */
1781 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1782 {
1783     int ret = -1;
1784 
1785     trace_ram_discard_range(rbname, start, length);
1786 
1787     rcu_read_lock();
1788     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1789 
1790     if (!rb) {
1791         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1792         goto err;
1793     }
1794 
1795     ret = ram_block_discard_range(rb, start, length);
1796 
1797 err:
1798     rcu_read_unlock();
1799 
1800     return ret;
1801 }
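
/*
 * Minimal usage sketch, compiled out: a hypothetical caller dropping one
 * target page at the start of a block named "pc.ram".  The block name and
 * range are assumptions for illustration; real callers pass values taken
 * from the migration stream.
 */
#if 0
static void example_discard_first_page(void)
{
    /* Discard one target-page-sized range at offset 0 of "pc.ram" */
    if (ram_discard_range("pc.ram", 0, TARGET_PAGE_SIZE) < 0) {
        error_report("example: discard of pc.ram failed");
    }
}
#endif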
1802 
1803 static int ram_state_init(RAMState **rsp)
1804 {
1805     *rsp = g_new0(RAMState, 1);
1806 
1807     qemu_mutex_init(&(*rsp)->bitmap_mutex);
1808     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1809     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1810 
1811     if (migrate_use_xbzrle()) {
1812         XBZRLE_cache_lock();
1813         XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1814         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1815                                   TARGET_PAGE_SIZE,
1816                                   TARGET_PAGE_SIZE);
1817         if (!XBZRLE.cache) {
1818             XBZRLE_cache_unlock();
1819             error_report("Error creating cache");
1820             g_free(*rsp);
1821             *rsp = NULL;
1822             return -1;
1823         }
1824         XBZRLE_cache_unlock();
1825 
1826         /* We prefer not to abort if there is no memory */
1827         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1828         if (!XBZRLE.encoded_buf) {
1829             error_report("Error allocating encoded_buf");
1830             g_free(*rsp);
1831             *rsp = NULL;
1832             return -1;
1833         }
1834 
1835         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1836         if (!XBZRLE.current_buf) {
1837             error_report("Error allocating current_buf");
1838             g_free(XBZRLE.encoded_buf);
1839             XBZRLE.encoded_buf = NULL;
1840             g_free(*rsp);
1841             *rsp = NULL;
1842             return -1;
1843         }
1844     }
1845 
1846     /* For memory_global_dirty_log_start below.  */
1847     qemu_mutex_lock_iothread();
1848 
1849     qemu_mutex_lock_ramlist();
1850     rcu_read_lock();
1851     ram_state_reset(*rsp);
1852 
1853     /* Skip setting bitmap if there is no RAM */
1854     if (ram_bytes_total()) {
1855         RAMBlock *block;
1856 
1857         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1858             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1859 
1860             block->bmap = bitmap_new(pages);
1861             bitmap_set(block->bmap, 0, pages);
1862             if (migrate_postcopy_ram()) {
1863                 block->unsentmap = bitmap_new(pages);
1864                 bitmap_set(block->unsentmap, 0, pages);
1865             }
1866         }
1867     }
1868 
1869     /*
1870      * Count the total number of pages used by ram blocks not including any
1871      * gaps due to alignment or unplugs.
1872      */
1873     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1874 
1875     memory_global_dirty_log_start();
1876     migration_bitmap_sync(*rsp);
1877     qemu_mutex_unlock_ramlist();
1878     qemu_mutex_unlock_iothread();
1879     rcu_read_unlock();
1880 
1881     return 0;
1882 }
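
/*
 * Rough sizing of the bitmaps allocated above, for illustration: a
 * RAMBlock with max_length = 4 GiB and TARGET_PAGE_SIZE = 4 KiB covers
 * 4 GiB / 4 KiB = 1,048,576 pages, so block->bmap needs 1,048,576 bits,
 * i.e. about 128 KiB.  With postcopy enabled, block->unsentmap roughly
 * doubles that to 256 KiB for such a block.
 */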
1883 
1884 /*
1885  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1886  * long-running RCU critical section.  When RCU reclaims in the code
1887  * start to become numerous it will be necessary to reduce the
1888  * granularity of these critical sections.
1889  */
1890 
1891 /**
1892  * ram_save_setup: Setup RAM for migration
1893  *
1894  * Returns zero to indicate success and negative for error
1895  *
1896  * @f: QEMUFile where to send the data
1897  * @opaque: RAMState pointer
1898  */
1899 static int ram_save_setup(QEMUFile *f, void *opaque)
1900 {
1901     RAMState **rsp = opaque;
1902     RAMBlock *block;
1903 
1904     /* migration has already set up the bitmap; reuse it. */
1905     if (!migration_in_colo_state()) {
1906         if (ram_state_init(rsp) != 0) {
1907             return -1;
1908         }
1909     }
1910     (*rsp)->f = f;
1911 
1912     rcu_read_lock();
1913 
1914     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1915 
1916     RAMBLOCK_FOREACH(block) {
1917         qemu_put_byte(f, strlen(block->idstr));
1918         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1919         qemu_put_be64(f, block->used_length);
1920         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1921             qemu_put_be64(f, block->page_size);
1922         }
1923     }
1924 
1925     rcu_read_unlock();
1926 
1927     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1928     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1929 
1930     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1931 
1932     return 0;
1933 }
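
/*
 * For reference, the setup section emitted above has this layout on the
 * wire (as derived from the code; RDMA hook data, if any, is produced by
 * the ram_control_*_iterate calls before the trailing EOS):
 *
 *   be64   ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   per RAMBlock:
 *     byte   strlen(idstr)
 *     bytes  idstr (not NUL terminated)
 *     be64   used_length
 *     be64   page_size  (only with postcopy and a non-host-sized block)
 *   be64   RAM_SAVE_FLAG_EOS
 */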
1934 
1935 /**
1936  * ram_save_iterate: iterative stage for migration
1937  *
1938  * Returns zero to indicate success and negative for error
1939  *
1940  * @f: QEMUFile where to send the data
1941  * @opaque: RAMState pointer
1942  */
1943 static int ram_save_iterate(QEMUFile *f, void *opaque)
1944 {
1945     RAMState **temp = opaque;
1946     RAMState *rs = *temp;
1947     int ret;
1948     int i;
1949     int64_t t0;
1950     int done = 0;
1951 
1952     rcu_read_lock();
1953     if (ram_list.version != rs->last_version) {
1954         ram_state_reset(rs);
1955     }
1956 
1957     /* Read version before ram_list.blocks */
1958     smp_rmb();
1959 
1960     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1961 
1962     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1963     i = 0;
1964     while ((ret = qemu_file_rate_limit(f)) == 0) {
1965         int pages;
1966 
1967         pages = ram_find_and_save_block(rs, false);
1968         /* no more pages to send */
1969         if (pages == 0) {
1970             done = 1;
1971             break;
1972         }
1973         rs->iterations++;
1974 
1975         /* we want to check in the 1st loop, just in case it was the 1st time
1976            and we had to sync the dirty bitmap.
1977            qemu_clock_get_ns() is a bit expensive, so we only check once
1978            every few iterations
1979         */
1980         if ((i & 63) == 0) {
1981             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1982             if (t1 > MAX_WAIT) {
1983                 trace_ram_save_iterate_big_wait(t1, i);
1984                 break;
1985             }
1986         }
1987         i++;
1988     }
1989     flush_compressed_data(rs);
1990     rcu_read_unlock();
1991 
1992     /*
1993      * Must occur before EOS (or any QEMUFile operation)
1994      * because of RDMA protocol.
1995      */
1996     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1997 
1998     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1999     ram_counters.transferred += 8;
2000 
2001     ret = qemu_file_get_error(f);
2002     if (ret < 0) {
2003         return ret;
2004     }
2005 
2006     return done;
2007 }
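
/*
 * Note on the loop above: it exits early once qemu_file_rate_limit()
 * reports the bandwidth limit has been hit, and the clock is sampled only
 * when (i & 63) == 0, i.e. every 64 iterations, so with 4 KiB target
 * pages roughly 256 KiB of guest RAM is processed between checks against
 * MAX_WAIT.
 */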
2008 
2009 /**
2010  * ram_save_complete: function called to send the remaining amount of ram
2011  *
2012  * Returns zero to indicate success
2013  *
2014  * Called with iothread lock
2015  *
2016  * @f: QEMUFile where to send the data
2017  * @opaque: RAMState pointer
2018  */
2019 static int ram_save_complete(QEMUFile *f, void *opaque)
2020 {
2021     RAMState **temp = opaque;
2022     RAMState *rs = *temp;
2023 
2024     rcu_read_lock();
2025 
2026     if (!migration_in_postcopy()) {
2027         migration_bitmap_sync(rs);
2028     }
2029 
2030     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2031 
2032     /* try transferring iterative blocks of memory */
2033 
2034     /* flush all remaining blocks regardless of rate limiting */
2035     while (true) {
2036         int pages;
2037 
2038         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2039         /* no more blocks to send */
2040         if (pages == 0) {
2041             break;
2042         }
2043     }
2044 
2045     flush_compressed_data(rs);
2046     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2047 
2048     rcu_read_unlock();
2049 
2050     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2051 
2052     return 0;
2053 }
2054 
2055 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2056                              uint64_t *non_postcopiable_pending,
2057                              uint64_t *postcopiable_pending)
2058 {
2059     RAMState **temp = opaque;
2060     RAMState *rs = *temp;
2061     uint64_t remaining_size;
2062 
2063     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2064 
2065     if (!migration_in_postcopy() &&
2066         remaining_size < max_size) {
2067         qemu_mutex_lock_iothread();
2068         rcu_read_lock();
2069         migration_bitmap_sync(rs);
2070         rcu_read_unlock();
2071         qemu_mutex_unlock_iothread();
2072         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2073     }
2074 
2075     /* We can do postcopy, and all the data is postcopiable */
2076     *postcopiable_pending += remaining_size;
2077 }
2078 
2079 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2080 {
2081     unsigned int xh_len;
2082     int xh_flags;
2083     uint8_t *loaded_data;
2084 
2085     if (!xbzrle_decoded_buf) {
2086         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2087     }
2088     loaded_data = xbzrle_decoded_buf;
2089 
2090     /* extract RLE header */
2091     xh_flags = qemu_get_byte(f);
2092     xh_len = qemu_get_be16(f);
2093 
2094     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2095         error_report("Failed to load XBZRLE page - wrong compression!");
2096         return -1;
2097     }
2098 
2099     if (xh_len > TARGET_PAGE_SIZE) {
2100         error_report("Failed to load XBZRLE page - len overflow!");
2101         return -1;
2102     }
2103     /* load data and decode */
2104     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2105 
2106     /* decode RLE */
2107     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2108                              TARGET_PAGE_SIZE) == -1) {
2109         error_report("Failed to load XBZRLE page - decode error!");
2110         return -1;
2111     }
2112 
2113     return 0;
2114 }
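
/*
 * The XBZRLE record parsed above follows the page's addr/flags header and
 * looks like this on the wire:
 *
 *   byte   xh_flags  -- must be ENCODING_FLAG_XBZRLE
 *   be16   xh_len    -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes  xh_len bytes of encoded data, applied as a delta against the
 *          current contents of the destination page
 */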
2115 
2116 /**
2117  * ram_block_from_stream: read a RAMBlock id from the migration stream
2118  *
2119  * Must be called from within a rcu critical section.
2120  * Must be called from within an RCU critical section.
2121  * Returns a pointer from within the RCU-protected ram_list.
2122  *
2123  * @f: QEMUFile where to read the data from
2124  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2125  */
2126 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2127 {
2128     static RAMBlock *block = NULL;
2129     char id[256];
2130     uint8_t len;
2131 
2132     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2133         if (!block) {
2134             error_report("Ack, bad migration stream!");
2135             return NULL;
2136         }
2137         return block;
2138     }
2139 
2140     len = qemu_get_byte(f);
2141     qemu_get_buffer(f, (uint8_t *)id, len);
2142     id[len] = 0;
2143 
2144     block = qemu_ram_block_by_name(id);
2145     if (!block) {
2146         error_report("Can't find block %s", id);
2147         return NULL;
2148     }
2149 
2150     return block;
2151 }
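
/*
 * Block-id encoding consumed above: a page header that sets
 * RAM_SAVE_FLAG_CONTINUE reuses the previously named block; otherwise the
 * header is followed by
 *
 *   byte   len  -- length of the RAMBlock idstr (< 256)
 *   bytes  len bytes of idstr (not NUL terminated on the wire)
 *
 * so the id is only re-sent when the stream switches to another block.
 */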
2152 
2153 static inline void *host_from_ram_block_offset(RAMBlock *block,
2154                                                ram_addr_t offset)
2155 {
2156     if (!offset_in_ramblock(block, offset)) {
2157         return NULL;
2158     }
2159 
2160     return block->host + offset;
2161 }
2162 
2163 /**
2164  * ram_handle_compressed: handle the zero page case
2165  *
2166  * If a page (or a whole RDMA chunk) has been
2167  * determined to be zero, then zap it.
2168  *
2169  * @host: host address for the zero page
2170  * @ch: what the page is filled from.  We only support zero
2171  * @size: size of the zero page
2172  */
2173 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2174 {
2175     if (ch != 0 || !is_zero_range(host, size)) {
2176         memset(host, ch, size);
2177     }
2178 }
2179 
2180 static void *do_data_decompress(void *opaque)
2181 {
2182     DecompressParam *param = opaque;
2183     unsigned long pagesize;
2184     uint8_t *des;
2185     int len;
2186 
2187     qemu_mutex_lock(&param->mutex);
2188     while (!param->quit) {
2189         if (param->des) {
2190             des = param->des;
2191             len = param->len;
2192             param->des = 0;
2193             qemu_mutex_unlock(&param->mutex);
2194 
2195             pagesize = TARGET_PAGE_SIZE;
2196             /* uncompress() can fail in some cases, especially when the
2197              * page was dirtied while it was being compressed.  That is
2198              * not a problem, because the dirty page will be retransmitted
2199              * and uncompress() won't corrupt the data in other pages.
2200              */
2201             uncompress((Bytef *)des, &pagesize,
2202                        (const Bytef *)param->compbuf, len);
2203 
2204             qemu_mutex_lock(&decomp_done_lock);
2205             param->done = true;
2206             qemu_cond_signal(&decomp_done_cond);
2207             qemu_mutex_unlock(&decomp_done_lock);
2208 
2209             qemu_mutex_lock(&param->mutex);
2210         } else {
2211             qemu_cond_wait(&param->cond, &param->mutex);
2212         }
2213     }
2214     qemu_mutex_unlock(&param->mutex);
2215 
2216     return NULL;
2217 }
2218 
2219 static void wait_for_decompress_done(void)
2220 {
2221     int idx, thread_count;
2222 
2223     if (!migrate_use_compression()) {
2224         return;
2225     }
2226 
2227     thread_count = migrate_decompress_threads();
2228     qemu_mutex_lock(&decomp_done_lock);
2229     for (idx = 0; idx < thread_count; idx++) {
2230         while (!decomp_param[idx].done) {
2231             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2232         }
2233     }
2234     qemu_mutex_unlock(&decomp_done_lock);
2235 }
2236 
2237 void migrate_decompress_threads_create(void)
2238 {
2239     int i, thread_count;
2240 
2241     thread_count = migrate_decompress_threads();
2242     decompress_threads = g_new0(QemuThread, thread_count);
2243     decomp_param = g_new0(DecompressParam, thread_count);
2244     qemu_mutex_init(&decomp_done_lock);
2245     qemu_cond_init(&decomp_done_cond);
2246     for (i = 0; i < thread_count; i++) {
2247         qemu_mutex_init(&decomp_param[i].mutex);
2248         qemu_cond_init(&decomp_param[i].cond);
2249         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2250         decomp_param[i].done = true;
2251         decomp_param[i].quit = false;
2252         qemu_thread_create(decompress_threads + i, "decompress",
2253                            do_data_decompress, decomp_param + i,
2254                            QEMU_THREAD_JOINABLE);
2255     }
2256 }
2257 
2258 void migrate_decompress_threads_join(void)
2259 {
2260     int i, thread_count;
2261 
2262     thread_count = migrate_decompress_threads();
2263     for (i = 0; i < thread_count; i++) {
2264         qemu_mutex_lock(&decomp_param[i].mutex);
2265         decomp_param[i].quit = true;
2266         qemu_cond_signal(&decomp_param[i].cond);
2267         qemu_mutex_unlock(&decomp_param[i].mutex);
2268     }
2269     for (i = 0; i < thread_count; i++) {
2270         qemu_thread_join(decompress_threads + i);
2271         qemu_mutex_destroy(&decomp_param[i].mutex);
2272         qemu_cond_destroy(&decomp_param[i].cond);
2273         g_free(decomp_param[i].compbuf);
2274     }
2275     g_free(decompress_threads);
2276     g_free(decomp_param);
2277     decompress_threads = NULL;
2278     decomp_param = NULL;
2279 }
2280 
2281 static void decompress_data_with_multi_threads(QEMUFile *f,
2282                                                void *host, int len)
2283 {
2284     int idx, thread_count;
2285 
2286     thread_count = migrate_decompress_threads();
2287     qemu_mutex_lock(&decomp_done_lock);
2288     while (true) {
2289         for (idx = 0; idx < thread_count; idx++) {
2290             if (decomp_param[idx].done) {
2291                 decomp_param[idx].done = false;
2292                 qemu_mutex_lock(&decomp_param[idx].mutex);
2293                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2294                 decomp_param[idx].des = host;
2295                 decomp_param[idx].len = len;
2296                 qemu_cond_signal(&decomp_param[idx].cond);
2297                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2298                 break;
2299             }
2300         }
2301         if (idx < thread_count) {
2302             break;
2303         } else {
2304             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2305         }
2306     }
2307     qemu_mutex_unlock(&decomp_done_lock);
2308 }
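
/*
 * Hand-off between the load thread and the decompression workers, as
 * implemented above and in do_data_decompress(): the load thread takes
 * decomp_done_lock, picks a worker whose done flag is set, fills its
 * compbuf/des/len under the worker's mutex and signals its cond; the
 * worker uncompresses into the destination page, sets done again and
 * signals decomp_done_cond so the slot can be reused.
 */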
2309 
2310 /**
2311  * ram_postcopy_incoming_init: allocate postcopy data structures
2312  *
2313  * Returns 0 for success and negative if there was an error
2314  *
2315  * @mis: current migration incoming state
2316  *
2317  * Allocate data structures etc. needed by incoming migration with
2318  * postcopy-ram. postcopy-ram's similarly named
2319  * postcopy_ram_incoming_init does the work.
2320  */
2321 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2322 {
2323     unsigned long ram_pages = last_ram_page();
2324 
2325     return postcopy_ram_incoming_init(mis, ram_pages);
2326 }
2327 
2328 /**
2329  * ram_load_postcopy: load a page in postcopy case
2330  *
2331  * Returns 0 for success or -errno in case of error
2332  *
2333  * Called in postcopy mode by ram_load().
2334  * rcu_read_lock is taken prior to this being called.
2335  *
2336  * @f: QEMUFile where to send the data
2337  * @f: QEMUFile to receive the data from
2338 static int ram_load_postcopy(QEMUFile *f)
2339 {
2340     int flags = 0, ret = 0;
2341     bool place_needed = false;
2342     bool matching_page_sizes = false;
2343     MigrationIncomingState *mis = migration_incoming_get_current();
2344     /* Temporary page that is later 'placed' */
2345     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2346     void *last_host = NULL;
2347     bool all_zero = false;
2348 
2349     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2350         ram_addr_t addr;
2351         void *host = NULL;
2352         void *page_buffer = NULL;
2353         void *place_source = NULL;
2354         RAMBlock *block = NULL;
2355         uint8_t ch;
2356 
2357         addr = qemu_get_be64(f);
2358         flags = addr & ~TARGET_PAGE_MASK;
2359         addr &= TARGET_PAGE_MASK;
2360 
2361         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2362         place_needed = false;
2363         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2364             block = ram_block_from_stream(f, flags);
2365 
2366             host = host_from_ram_block_offset(block, addr);
2367             if (!host) {
2368                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2369                 ret = -EINVAL;
2370                 break;
2371             }
2372             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2373             /*
2374              * Postcopy requires that we place whole host pages atomically;
2375              * these may be huge pages for RAMBlocks that are backed by
2376              * hugetlbfs.
2377              * To make it atomic, the data is read into a temporary page
2378              * that's moved into place later.
2379              * The migration protocol uses possibly smaller target pages;
2380              * however, the source ensures it always sends all the components
2381              * of a host page in order.
2382              */
2383             page_buffer = postcopy_host_page +
2384                           ((uintptr_t)host & (block->page_size - 1));
2385             /* Start of a new host page; if all its TPs are zero we can optimise the place */
2386             if (!((uintptr_t)host & (block->page_size - 1))) {
2387                 all_zero = true;
2388             } else {
2389                 /* not the 1st TP within the HP */
2390                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2391                     error_report("Non-sequential target page %p/%p",
2392                                   host, last_host);
2393                     ret = -EINVAL;
2394                     break;
2395                 }
2396             }
2397 
2398 
2399             /*
2400              * If it's the last part of a host page then we place the host
2401              * page
2402              */
2403             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2404                                      (block->page_size - 1)) == 0;
2405             place_source = postcopy_host_page;
2406         }
2407         last_host = host;
2408 
2409         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2410         case RAM_SAVE_FLAG_ZERO:
2411             ch = qemu_get_byte(f);
2412             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2413             if (ch) {
2414                 all_zero = false;
2415             }
2416             break;
2417 
2418         case RAM_SAVE_FLAG_PAGE:
2419             all_zero = false;
2420             if (!place_needed || !matching_page_sizes) {
2421                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2422             } else {
2423                 /* Avoid an extra copy inside qemu_file: postcopy will
2424                  * copy the page later when placing it.  We can only do
2425                  * this when the read is done in one go (matching page sizes)
2426                  */
2427                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2428                                          TARGET_PAGE_SIZE);
2429             }
2430             break;
2431         case RAM_SAVE_FLAG_EOS:
2432             /* normal exit */
2433             break;
2434         default:
2435             error_report("Unknown combination of migration flags: %#x"
2436                          " (postcopy mode)", flags);
2437             ret = -EINVAL;
2438         }
2439 
2440         if (place_needed) {
2441             /* This gets called at the last target page in the host page */
2442             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2443 
2444             if (all_zero) {
2445                 ret = postcopy_place_page_zero(mis, place_dest,
2446                                                block->page_size);
2447             } else {
2448                 ret = postcopy_place_page(mis, place_dest,
2449                                           place_source, block->page_size);
2450             }
2451         }
2452         if (!ret) {
2453             ret = qemu_file_get_error(f);
2454         }
2455     }
2456 
2457     return ret;
2458 }
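
/*
 * Worked example of the host-page assembly above: for a hugetlbfs-backed
 * RAMBlock with block->page_size = 2 MiB and TARGET_PAGE_SIZE = 4 KiB,
 * 512 consecutive target pages are accumulated in postcopy_host_page
 * (and checked to arrive sequentially); only the 512th sets place_needed,
 * so the whole 2 MiB page is placed atomically in one go.  When
 * block->page_size == TARGET_PAGE_SIZE every target page is placed
 * immediately.
 */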
2459 
2460 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2461 {
2462     int flags = 0, ret = 0;
2463     static uint64_t seq_iter;
2464     int len = 0;
2465     /*
2466      * If the system is running in postcopy mode, page inserts to host memory must
2467      * be atomic
2468      */
2469     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2470     /* ADVISE comes earlier; it shows the source has the postcopy capability on */
2471     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2472 
2473     seq_iter++;
2474 
2475     if (version_id != 4) {
2476         ret = -EINVAL;
2477     }
2478 
2479     /* This RCU critical section can be very long running.
2480      * When RCU reclaims in the code start to become numerous,
2481      * it will be necessary to reduce the granularity of this
2482      * critical section.
2483      */
2484     rcu_read_lock();
2485 
2486     if (postcopy_running) {
2487         ret = ram_load_postcopy(f);
2488     }
2489 
2490     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2491         ram_addr_t addr, total_ram_bytes;
2492         void *host = NULL;
2493         uint8_t ch;
2494 
2495         addr = qemu_get_be64(f);
2496         flags = addr & ~TARGET_PAGE_MASK;
2497         addr &= TARGET_PAGE_MASK;
2498 
2499         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2500                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2501             RAMBlock *block = ram_block_from_stream(f, flags);
2502 
2503             host = host_from_ram_block_offset(block, addr);
2504             if (!host) {
2505                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2506                 ret = -EINVAL;
2507                 break;
2508             }
2509             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2510         }
2511 
2512         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2513         case RAM_SAVE_FLAG_MEM_SIZE:
2514             /* Synchronize RAM block list */
2515             total_ram_bytes = addr;
2516             while (!ret && total_ram_bytes) {
2517                 RAMBlock *block;
2518                 char id[256];
2519                 ram_addr_t length;
2520 
2521                 len = qemu_get_byte(f);
2522                 qemu_get_buffer(f, (uint8_t *)id, len);
2523                 id[len] = 0;
2524                 length = qemu_get_be64(f);
2525 
2526                 block = qemu_ram_block_by_name(id);
2527                 if (block) {
2528                     if (length != block->used_length) {
2529                         Error *local_err = NULL;
2530 
2531                         ret = qemu_ram_resize(block, length,
2532                                               &local_err);
2533                         if (local_err) {
2534                             error_report_err(local_err);
2535                         }
2536                     }
2537                     /* For postcopy we need to check hugepage sizes match */
2538                     if (postcopy_advised &&
2539                         block->page_size != qemu_host_page_size) {
2540                         uint64_t remote_page_size = qemu_get_be64(f);
2541                         if (remote_page_size != block->page_size) {
2542                             error_report("Mismatched RAM page size %s "
2543                                          "(local) %zd != %" PRId64,
2544                                          id, block->page_size,
2545                                          remote_page_size);
2546                             ret = -EINVAL;
2547                         }
2548                     }
2549                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2550                                           block->idstr);
2551                 } else {
2552                     error_report("Unknown ramblock \"%s\", cannot "
2553                                  "accept migration", id);
2554                     ret = -EINVAL;
2555                 }
2556 
2557                 total_ram_bytes -= length;
2558             }
2559             break;
2560 
2561         case RAM_SAVE_FLAG_ZERO:
2562             ch = qemu_get_byte(f);
2563             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2564             break;
2565 
2566         case RAM_SAVE_FLAG_PAGE:
2567             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2568             break;
2569 
2570         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2571             len = qemu_get_be32(f);
2572             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2573                 error_report("Invalid compressed data length: %d", len);
2574                 ret = -EINVAL;
2575                 break;
2576             }
2577             decompress_data_with_multi_threads(f, host, len);
2578             break;
2579 
2580         case RAM_SAVE_FLAG_XBZRLE:
2581             if (load_xbzrle(f, addr, host) < 0) {
2582                 error_report("Failed to decompress XBZRLE page at "
2583                              RAM_ADDR_FMT, addr);
2584                 ret = -EINVAL;
2585                 break;
2586             }
2587             break;
2588         case RAM_SAVE_FLAG_EOS:
2589             /* normal exit */
2590             break;
2591         default:
2592             if (flags & RAM_SAVE_FLAG_HOOK) {
2593                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2594             } else {
2595                 error_report("Unknown combination of migration flags: %#x",
2596                              flags);
2597                 ret = -EINVAL;
2598             }
2599         }
2600         if (!ret) {
2601             ret = qemu_file_get_error(f);
2602         }
2603     }
2604 
2605     wait_for_decompress_done();
2606     rcu_read_unlock();
2607     trace_ram_load_complete(ret, seq_iter);
2608     return ret;
2609 }
2610 
2611 static SaveVMHandlers savevm_ram_handlers = {
2612     .save_live_setup = ram_save_setup,
2613     .save_live_iterate = ram_save_iterate,
2614     .save_live_complete_postcopy = ram_save_complete,
2615     .save_live_complete_precopy = ram_save_complete,
2616     .save_live_pending = ram_save_pending,
2617     .load_state = ram_load,
2618     .cleanup = ram_migration_cleanup,
2619 };
2620 
2621 void ram_mig_init(void)
2622 {
2623     qemu_mutex_init(&XBZRLE.lock);
2624     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2625 }
2626