xref: /openbmc/qemu/migration/ram.c (revision 49cbd887abfaf3269653a292e7ea6c7852832250)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/qmp/qerror.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "exec/target_page.h"
49 #include "qemu/rcu_queue.h"
50 #include "migration/colo.h"
51 #include "migration/block.h"
52 
53 /***********************************************************/
54 /* ram save/restore */
55 
56 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
57  * worked for pages that were filled with the same char.  We switched
58  * it to only search for the zero value, and renamed it to avoid
59  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
60  */
61 
62 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
63 #define RAM_SAVE_FLAG_ZERO     0x02
64 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
65 #define RAM_SAVE_FLAG_PAGE     0x08
66 #define RAM_SAVE_FLAG_EOS      0x10
67 #define RAM_SAVE_FLAG_CONTINUE 0x20
68 #define RAM_SAVE_FLAG_XBZRLE   0x40
69 /* 0x80 is reserved in migration.h start with 0x100 next */
70 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
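/*
 * Note: on the wire each page record starts with a be64 value that packs
 * the page's offset within its RAMBlock in the upper bits and the flags
 * above in the low bits (every flag fits below the target page size, so
 * flags never collide with a page-aligned offset); see save_page_header().
 */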
71 
72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 {
74     return buffer_is_zero(p, size);
75 }
76 
77 XBZRLECacheStats xbzrle_counters;
78 
79 /* This struct contains the XBZRLE cache and a static page
80    used by the compression */
81 static struct {
82     /* buffer used for XBZRLE encoding */
83     uint8_t *encoded_buf;
84     /* buffer for storing page content */
85     uint8_t *current_buf;
86     /* Cache for XBZRLE, Protected by lock. */
87     PageCache *cache;
88     QemuMutex lock;
89     /* it will store a page full of zeros */
90     uint8_t *zero_target_page;
91     /* buffer used for XBZRLE decoding */
92     uint8_t *decoded_buf;
93 } XBZRLE;
94 
95 static void XBZRLE_cache_lock(void)
96 {
97     if (migrate_use_xbzrle())
98         qemu_mutex_lock(&XBZRLE.lock);
99 }
100 
101 static void XBZRLE_cache_unlock(void)
102 {
103     if (migrate_use_xbzrle())
104         qemu_mutex_unlock(&XBZRLE.lock);
105 }
106 
107 /**
108  * xbzrle_cache_resize: resize the xbzrle cache
109  *
110  * This function is called from qmp_migrate_set_cache_size in the main
111  * thread, possibly while a migration is in progress.  A running
112  * migration may be using the cache and might finish during this call,
113  * hence changes to the cache are protected by XBZRLE.lock.
114  *
115  * Returns the new_size or negative in case of error.
116  *
117  * @new_size: new cache size
118  * @errp: set *errp if the check failed, with reason
119  */
120 int64_t xbzrle_cache_resize(int64_t new_size, Error **errp)
121 {
122     PageCache *new_cache;
123     int64_t ret;
124 
125     /* Check for truncation */
126     if (new_size != (size_t)new_size) {
127         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
128                    "exceeding address space");
129         return -1;
130     }
131 
132     /* Cache should not be larger than guest ram size */
133     if (new_size > ram_bytes_total()) {
134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135                    "exceeds guest ram size");
136         return -1;
137     }
138 
139     XBZRLE_cache_lock();
140 
141     if (XBZRLE.cache != NULL) {
142         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
143             goto out_new_size;
144         }
145         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
146         if (!new_cache) {
147             ret = -1;
148             goto out;
149         }
150 
151         cache_fini(XBZRLE.cache);
152         XBZRLE.cache = new_cache;
153     }
154 
155 out_new_size:
156     ret = pow2floor(new_size);
157 out:
158     XBZRLE_cache_unlock();
159     return ret;
160 }
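
/* Informational note: the effective cache size is pow2floor(new_size), so
 * a request for e.g. 600 MiB yields a 512 MiB cache, and it is that
 * rounded-down value which is returned to the caller.
 */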
161 
162 static void ramblock_recv_map_init(void)
163 {
164     RAMBlock *rb;
165 
166     RAMBLOCK_FOREACH(rb) {
167         assert(!rb->receivedmap);
168         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
169     }
170 }
171 
172 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
173 {
174     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
175                     rb->receivedmap);
176 }
177 
178 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
179 {
180     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
181 }
182 
183 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
184                                     size_t nr)
185 {
186     bitmap_set_atomic(rb->receivedmap,
187                       ramblock_recv_bitmap_offset(host_addr, rb),
188                       nr);
189 }
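
/* The helpers above maintain rb->receivedmap, a per-RAMBlock bitmap with
 * one bit per target page (allocated in ramblock_recv_map_init()).  The
 * destination sets a page's bit once that page's contents have arrived,
 * which lets postcopy tell received pages apart from missing ones.
 */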
190 
191 /*
192  * An outstanding page request, on the source, having been received
193  * and queued
194  */
195 struct RAMSrcPageRequest {
196     RAMBlock *rb;
197     hwaddr    offset;
198     hwaddr    len;
199 
200     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
201 };
202 
203 /* State of RAM for migration */
204 struct RAMState {
205     /* QEMUFile used for this migration */
206     QEMUFile *f;
207     /* Last block that we have visited searching for dirty pages */
208     RAMBlock *last_seen_block;
209     /* Last block from where we have sent data */
210     RAMBlock *last_sent_block;
211     /* Last dirty target page we have sent */
212     ram_addr_t last_page;
213     /* last ram version we have seen */
214     uint32_t last_version;
215     /* We are in the first round */
216     bool ram_bulk_stage;
217     /* How many times we have dirtied too many pages */
218     int dirty_rate_high_cnt;
219     /* these variables are used for bitmap sync */
220     /* last time we did a full bitmap_sync */
221     int64_t time_last_bitmap_sync;
222     /* bytes transferred at the start of the current period */
223     uint64_t bytes_xfer_prev;
224     /* number of dirty pages accumulated during the current period */
225     uint64_t num_dirty_pages_period;
226     /* xbzrle misses since the beginning of the period */
227     uint64_t xbzrle_cache_miss_prev;
228     /* number of iterations at the beginning of the period */
229     uint64_t iterations_prev;
230     /* Iterations since start */
231     uint64_t iterations;
232     /* number of dirty bits in the bitmap */
233     uint64_t migration_dirty_pages;
234     /* protects modification of the bitmap */
235     QemuMutex bitmap_mutex;
236     /* The RAMBlock used in the last src_page_requests */
237     RAMBlock *last_req_rb;
238     /* Queue of outstanding page requests from the destination */
239     QemuMutex src_page_req_mutex;
240     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
241 };
242 typedef struct RAMState RAMState;
243 
244 static RAMState *ram_state;
245 
246 uint64_t ram_bytes_remaining(void)
247 {
248     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
249 }
250 
251 MigrationStats ram_counters;
252 
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255     /* Current block being searched */
256     RAMBlock    *block;
257     /* Current page to search from */
258     unsigned long page;
259     /* Set once we wrap around */
260     bool         complete_round;
261 };
262 typedef struct PageSearchStatus PageSearchStatus;
263 
264 struct CompressParam {
265     bool done;
266     bool quit;
267     QEMUFile *file;
268     QemuMutex mutex;
269     QemuCond cond;
270     RAMBlock *block;
271     ram_addr_t offset;
272 };
273 typedef struct CompressParam CompressParam;
274 
275 struct DecompressParam {
276     bool done;
277     bool quit;
278     QemuMutex mutex;
279     QemuCond cond;
280     void *des;
281     uint8_t *compbuf;
282     int len;
283 };
284 typedef struct DecompressParam DecompressParam;
285 
286 static CompressParam *comp_param;
287 static QemuThread *compress_threads;
288 /* comp_done_cond is used to wake up the migration thread when
289  * one of the compression threads has finished its compression.
290  * comp_done_lock is the mutex used together with comp_done_cond.
291  */
292 static QemuMutex comp_done_lock;
293 static QemuCond comp_done_cond;
294 /* The empty QEMUFileOps is used by the QEMUFile in CompressParam */
295 static const QEMUFileOps empty_ops = { };
296 
297 static DecompressParam *decomp_param;
298 static QemuThread *decompress_threads;
299 static QemuMutex decomp_done_lock;
300 static QemuCond decomp_done_cond;
301 
302 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
303                                 ram_addr_t offset);
304 
305 static void *do_data_compress(void *opaque)
306 {
307     CompressParam *param = opaque;
308     RAMBlock *block;
309     ram_addr_t offset;
310 
311     qemu_mutex_lock(&param->mutex);
312     while (!param->quit) {
313         if (param->block) {
314             block = param->block;
315             offset = param->offset;
316             param->block = NULL;
317             qemu_mutex_unlock(&param->mutex);
318 
319             do_compress_ram_page(param->file, block, offset);
320 
321             qemu_mutex_lock(&comp_done_lock);
322             param->done = true;
323             qemu_cond_signal(&comp_done_cond);
324             qemu_mutex_unlock(&comp_done_lock);
325 
326             qemu_mutex_lock(&param->mutex);
327         } else {
328             qemu_cond_wait(&param->cond, &param->mutex);
329         }
330     }
331     qemu_mutex_unlock(&param->mutex);
332 
333     return NULL;
334 }
335 
336 static inline void terminate_compression_threads(void)
337 {
338     int idx, thread_count;
339 
340     thread_count = migrate_compress_threads();
341 
342     for (idx = 0; idx < thread_count; idx++) {
343         qemu_mutex_lock(&comp_param[idx].mutex);
344         comp_param[idx].quit = true;
345         qemu_cond_signal(&comp_param[idx].cond);
346         qemu_mutex_unlock(&comp_param[idx].mutex);
347     }
348 }
349 
350 static void compress_threads_save_cleanup(void)
351 {
352     int i, thread_count;
353 
354     if (!migrate_use_compression()) {
355         return;
356     }
357     terminate_compression_threads();
358     thread_count = migrate_compress_threads();
359     for (i = 0; i < thread_count; i++) {
360         qemu_thread_join(compress_threads + i);
361         qemu_fclose(comp_param[i].file);
362         qemu_mutex_destroy(&comp_param[i].mutex);
363         qemu_cond_destroy(&comp_param[i].cond);
364     }
365     qemu_mutex_destroy(&comp_done_lock);
366     qemu_cond_destroy(&comp_done_cond);
367     g_free(compress_threads);
368     g_free(comp_param);
369     compress_threads = NULL;
370     comp_param = NULL;
371 }
372 
373 static void compress_threads_save_setup(void)
374 {
375     int i, thread_count;
376 
377     if (!migrate_use_compression()) {
378         return;
379     }
380     thread_count = migrate_compress_threads();
381     compress_threads = g_new0(QemuThread, thread_count);
382     comp_param = g_new0(CompressParam, thread_count);
383     qemu_cond_init(&comp_done_cond);
384     qemu_mutex_init(&comp_done_lock);
385     for (i = 0; i < thread_count; i++) {
386         /* comp_param[i].file is just used as a dummy buffer to save data,
387          * set its ops to empty.
388          */
389         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
390         comp_param[i].done = true;
391         comp_param[i].quit = false;
392         qemu_mutex_init(&comp_param[i].mutex);
393         qemu_cond_init(&comp_param[i].cond);
394         qemu_thread_create(compress_threads + i, "compress",
395                            do_data_compress, comp_param + i,
396                            QEMU_THREAD_JOINABLE);
397     }
398 }
399 
400 /* Multiple fd's */
401 
402 struct MultiFDSendParams {
403     uint8_t id;
404     char *name;
405     QemuThread thread;
406     QemuSemaphore sem;
407     QemuMutex mutex;
408     bool quit;
409 };
410 typedef struct MultiFDSendParams MultiFDSendParams;
411 
412 struct {
413     MultiFDSendParams *params;
414     /* number of created threads */
415     int count;
416 } *multifd_send_state;
417 
418 static void terminate_multifd_send_threads(Error *errp)
419 {
420     int i;
421 
422     for (i = 0; i < multifd_send_state->count; i++) {
423         MultiFDSendParams *p = &multifd_send_state->params[i];
424 
425         qemu_mutex_lock(&p->mutex);
426         p->quit = true;
427         qemu_sem_post(&p->sem);
428         qemu_mutex_unlock(&p->mutex);
429     }
430 }
431 
432 int multifd_save_cleanup(Error **errp)
433 {
434     int i;
435     int ret = 0;
436 
437     if (!migrate_use_multifd()) {
438         return 0;
439     }
440     terminate_multifd_send_threads(NULL);
441     for (i = 0; i < multifd_send_state->count; i++) {
442         MultiFDSendParams *p = &multifd_send_state->params[i];
443 
444         qemu_thread_join(&p->thread);
445         qemu_mutex_destroy(&p->mutex);
446         qemu_sem_destroy(&p->sem);
447         g_free(p->name);
448         p->name = NULL;
449     }
450     g_free(multifd_send_state->params);
451     multifd_send_state->params = NULL;
452     g_free(multifd_send_state);
453     multifd_send_state = NULL;
454     return ret;
455 }
456 
457 static void *multifd_send_thread(void *opaque)
458 {
459     MultiFDSendParams *p = opaque;
460 
461     while (true) {
462         qemu_mutex_lock(&p->mutex);
463         if (p->quit) {
464             qemu_mutex_unlock(&p->mutex);
465             break;
466         }
467         qemu_mutex_unlock(&p->mutex);
468         qemu_sem_wait(&p->sem);
469     }
470 
471     return NULL;
472 }
473 
474 int multifd_save_setup(void)
475 {
476     int thread_count;
477     uint8_t i;
478 
479     if (!migrate_use_multifd()) {
480         return 0;
481     }
482     thread_count = migrate_multifd_channels();
483     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
484     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
485     multifd_send_state->count = 0;
486     for (i = 0; i < thread_count; i++) {
487         MultiFDSendParams *p = &multifd_send_state->params[i];
488 
489         qemu_mutex_init(&p->mutex);
490         qemu_sem_init(&p->sem, 0);
491         p->quit = false;
492         p->id = i;
493         p->name = g_strdup_printf("multifdsend_%d", i);
494         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
495                            QEMU_THREAD_JOINABLE);
496 
497         multifd_send_state->count++;
498     }
499     return 0;
500 }
501 
502 struct MultiFDRecvParams {
503     uint8_t id;
504     char *name;
505     QemuThread thread;
506     QemuSemaphore sem;
507     QemuMutex mutex;
508     bool quit;
509 };
510 typedef struct MultiFDRecvParams MultiFDRecvParams;
511 
512 struct {
513     MultiFDRecvParams *params;
514     /* number of created threads */
515     int count;
516 } *multifd_recv_state;
517 
518 static void terminate_multifd_recv_threads(Error *errp)
519 {
520     int i;
521 
522     for (i = 0; i < multifd_recv_state->count; i++) {
523         MultiFDRecvParams *p = &multifd_recv_state->params[i];
524 
525         qemu_mutex_lock(&p->mutex);
526         p->quit = true;
527         qemu_sem_post(&p->sem);
528         qemu_mutex_unlock(&p->mutex);
529     }
530 }
531 
532 int multifd_load_cleanup(Error **errp)
533 {
534     int i;
535     int ret = 0;
536 
537     if (!migrate_use_multifd()) {
538         return 0;
539     }
540     terminate_multifd_recv_threads(NULL);
541     for (i = 0; i < multifd_recv_state->count; i++) {
542         MultiFDRecvParams *p = &multifd_recv_state->params[i];
543 
544         qemu_thread_join(&p->thread);
545         qemu_mutex_destroy(&p->mutex);
546         qemu_sem_destroy(&p->sem);
547         g_free(p->name);
548         p->name = NULL;
549     }
550     g_free(multifd_recv_state->params);
551     multifd_recv_state->params = NULL;
552     g_free(multifd_recv_state);
553     multifd_recv_state = NULL;
554 
555     return ret;
556 }
557 
558 static void *multifd_recv_thread(void *opaque)
559 {
560     MultiFDRecvParams *p = opaque;
561 
562     while (true) {
563         qemu_mutex_lock(&p->mutex);
564         if (p->quit) {
565             qemu_mutex_unlock(&p->mutex);
566             break;
567         }
568         qemu_mutex_unlock(&p->mutex);
569         qemu_sem_wait(&p->sem);
570     }
571 
572     return NULL;
573 }
574 
575 int multifd_load_setup(void)
576 {
577     int thread_count;
578     uint8_t i;
579 
580     if (!migrate_use_multifd()) {
581         return 0;
582     }
583     thread_count = migrate_multifd_channels();
584     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
585     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
586     multifd_recv_state->count = 0;
587     for (i = 0; i < thread_count; i++) {
588         MultiFDRecvParams *p = &multifd_recv_state->params[i];
589 
590         qemu_mutex_init(&p->mutex);
591         qemu_sem_init(&p->sem, 0);
592         p->quit = false;
593         p->id = i;
594         p->name = g_strdup_printf("multifdrecv_%d", i);
595         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
596                            QEMU_THREAD_JOINABLE);
597         multifd_recv_state->count++;
598     }
599     return 0;
600 }
601 
602 /**
603  * save_page_header: write page header to wire
604  *
605  * If the block differs from the one used for the last sent page, it
606  * also writes the block identification
607  *
608  * Returns the number of bytes written
609  * @rs: current RAM state
610  * @f: QEMUFile where to send the data
611  * @block: block that contains the page we want to send
612  * @offset: offset inside the block for the page; the lower bits hold flags
613  */
614 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
615                                ram_addr_t offset)
616 {
617     size_t size, len;
618 
619     if (block == rs->last_sent_block) {
620         offset |= RAM_SAVE_FLAG_CONTINUE;
621     }
622     qemu_put_be64(f, offset);
623     size = 8;
624 
625     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
626         len = strlen(block->idstr);
627         qemu_put_byte(f, len);
628         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
629         size += 1 + len;
630         rs->last_sent_block = block;
631     }
632     return size;
633 }
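
/* The wire layout produced by save_page_header() is therefore:
 *   [be64: page offset | flags]        always
 *   [u8: idstr length][idstr bytes]    only when the block changes
 *                                      (RAM_SAVE_FLAG_CONTINUE not set)
 */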
634 
635 /**
636  * mig_throttle_guest_down: throttle down the guest
637  *
638  * Reduce amount of guest cpu execution to hopefully slow down memory
639  * writes. If guest dirty memory rate is reduced below the rate at
640  * which we can transfer pages to the destination then we should be
641  * able to complete migration. Some workloads dirty memory way too
642  * fast and will not effectively converge, even with auto-converge.
643  */
644 static void mig_throttle_guest_down(void)
645 {
646     MigrationState *s = migrate_get_current();
647     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
648     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
649 
650     /* We have not started throttling yet. Let's start it. */
651     if (!cpu_throttle_active()) {
652         cpu_throttle_set(pct_initial);
653     } else {
654         /* Throttling already on, just increase the rate */
655         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
656     }
657 }
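
/* Example, assuming the default migration parameters
 * (cpu_throttle_initial=20, cpu_throttle_increment=10): successive calls
 * throttle guest vCPU execution at 20%, 30%, 40%, ... until the dirty
 * rate drops enough for the migration to converge.
 */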
658 
659 /**
660  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
661  *
662  * @rs: current RAM state
663  * @current_addr: address for the zero page
664  *
665  * Update the xbzrle cache to reflect a page that's been sent as all 0.
666  * The important thing is that a stale (not-yet-0'd) page be replaced
667  * by the new data.
668  * As a bonus, if the page wasn't in the cache it gets added so that
669  * when a small write is made into the 0'd page it gets XBZRLE sent.
670  */
671 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
672 {
673     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
674         return;
675     }
676 
677     /* We don't care if this fails to allocate a new cache page
678      * as long as it updated an old one */
679     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
680                  ram_counters.dirty_sync_count);
681 }
682 
683 #define ENCODING_FLAG_XBZRLE 0x1
684 
685 /**
686  * save_xbzrle_page: compress and send current page
687  *
688  * Returns: 1 means that we wrote the page
689  *          0 means that page is identical to the one already sent
690  *          -1 means that xbzrle would be longer than normal
691  *
692  * @rs: current RAM state
693  * @current_data: pointer to the address of the page contents
694  * @current_addr: addr of the page
695  * @block: block that contains the page we want to send
696  * @offset: offset inside the block for the page
697  * @last_stage: if we are at the completion stage
698  */
699 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
700                             ram_addr_t current_addr, RAMBlock *block,
701                             ram_addr_t offset, bool last_stage)
702 {
703     int encoded_len = 0, bytes_xbzrle;
704     uint8_t *prev_cached_page;
705 
706     if (!cache_is_cached(XBZRLE.cache, current_addr,
707                          ram_counters.dirty_sync_count)) {
708         xbzrle_counters.cache_miss++;
709         if (!last_stage) {
710             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
711                              ram_counters.dirty_sync_count) == -1) {
712                 return -1;
713             } else {
714                 /* update *current_data when the page has been
715                    inserted into cache */
716                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
717             }
718         }
719         return -1;
720     }
721 
722     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
723 
724     /* save current buffer into memory */
725     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
726 
727     /* XBZRLE encoding (if there is no overflow) */
728     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
729                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
730                                        TARGET_PAGE_SIZE);
731     if (encoded_len == 0) {
732         trace_save_xbzrle_page_skipping();
733         return 0;
734     } else if (encoded_len == -1) {
735         trace_save_xbzrle_page_overflow();
736         xbzrle_counters.overflow++;
737         /* update data in the cache */
738         if (!last_stage) {
739             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
740             *current_data = prev_cached_page;
741         }
742         return -1;
743     }
744 
745     /* update the data in the cache so it matches what the destination now has */
746     if (!last_stage) {
747         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
748     }
749 
750     /* Send XBZRLE based compressed page */
751     bytes_xbzrle = save_page_header(rs, rs->f, block,
752                                     offset | RAM_SAVE_FLAG_XBZRLE);
753     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
754     qemu_put_be16(rs->f, encoded_len);
755     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
756     bytes_xbzrle += encoded_len + 1 + 2;
757     xbzrle_counters.pages++;
758     xbzrle_counters.bytes += bytes_xbzrle;
759     ram_counters.transferred += bytes_xbzrle;
760 
761     return 1;
762 }
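
/* An XBZRLE page therefore looks like this on the wire:
 *   page header | ENCODING_FLAG_XBZRLE (1 byte) | encoded_len (be16) |
 *   encoded data
 * which is why bytes_xbzrle adds encoded_len + 1 + 2 to the header size.
 */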
763 
764 /**
765  * migration_bitmap_find_dirty: find the next dirty page from start
766  *
767  * Called with rcu_read_lock() to protect migration_bitmap
768  *
769  * Returns the page index within the RAMBlock of the next dirty page
770  *
771  * @rs: current RAM state
772  * @rb: RAMBlock where to search for dirty pages
773  * @start: page where we start the search
774  */
775 static inline
776 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
777                                           unsigned long start)
778 {
779     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
780     unsigned long *bitmap = rb->bmap;
781     unsigned long next;
782 
783     if (rs->ram_bulk_stage && start > 0) {
784         next = start + 1;
785     } else {
786         next = find_next_bit(bitmap, size, start);
787     }
788 
789     return next;
790 }
791 
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793                                                 RAMBlock *rb,
794                                                 unsigned long page)
795 {
796     bool ret;
797 
798     ret = test_and_clear_bit(page, rb->bmap);
799 
800     if (ret) {
801         rs->migration_dirty_pages--;
802     }
803     return ret;
804 }
805 
806 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
807                                         ram_addr_t start, ram_addr_t length)
808 {
809     rs->migration_dirty_pages +=
810         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
811                                               &rs->num_dirty_pages_period);
812 }
813 
814 /**
815  * ram_pagesize_summary: calculate all the pagesizes of a VM
816  *
817  * Returns a summary bitmap of the page sizes of all RAMBlocks
818  *
819  * For VMs with just normal pages this is equivalent to the host page
820  * size. If it's got some huge pages then it's the OR of all the
821  * different page sizes.
822  */
823 uint64_t ram_pagesize_summary(void)
824 {
825     RAMBlock *block;
826     uint64_t summary = 0;
827 
828     RAMBLOCK_FOREACH(block) {
829         summary |= block->page_size;
830     }
831 
832     return summary;
833 }
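
/* For example, a guest whose RAMBlocks are backed by ordinary 4 KiB pages
 * plus one 2 MiB hugetlbfs region reports 0x1000 | 0x200000 = 0x201000.
 */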
834 
835 static void migration_bitmap_sync(RAMState *rs)
836 {
837     RAMBlock *block;
838     int64_t end_time;
839     uint64_t bytes_xfer_now;
840 
841     ram_counters.dirty_sync_count++;
842 
843     if (!rs->time_last_bitmap_sync) {
844         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
845     }
846 
847     trace_migration_bitmap_sync_start();
848     memory_global_dirty_log_sync();
849 
850     qemu_mutex_lock(&rs->bitmap_mutex);
851     rcu_read_lock();
852     RAMBLOCK_FOREACH(block) {
853         migration_bitmap_sync_range(rs, block, 0, block->used_length);
854     }
855     rcu_read_unlock();
856     qemu_mutex_unlock(&rs->bitmap_mutex);
857 
858     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
859 
860     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
861 
862     /* more than 1 second = 1000 milliseconds */
863     if (end_time > rs->time_last_bitmap_sync + 1000) {
864         /* calculate period counters */
865         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866             / (end_time - rs->time_last_bitmap_sync);
867         bytes_xfer_now = ram_counters.transferred;
868 
869         /* During block migration the auto-converge logic incorrectly detects
870          * that ram migration makes no progress. Avoid this by disabling the
871          * throttling logic during the bulk phase of block migration. */
872         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
873             /* The following detection logic can be refined later. For now:
874                check whether the bytes dirtied in this period exceed half of
875                the bytes transferred since the last time we were in this
876                routine. If that happens twice, start or increase
877                throttling */
878 
879             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
880                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
881                 (++rs->dirty_rate_high_cnt >= 2)) {
882                     trace_migration_throttle();
883                     rs->dirty_rate_high_cnt = 0;
884                     mig_throttle_guest_down();
885             }
886         }
887 
888         if (migrate_use_xbzrle()) {
889             if (rs->iterations_prev != rs->iterations) {
890                 xbzrle_counters.cache_miss_rate =
891                    (double)(xbzrle_counters.cache_miss -
892                             rs->xbzrle_cache_miss_prev) /
893                    (rs->iterations - rs->iterations_prev);
894             }
895             rs->iterations_prev = rs->iterations;
896             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
897         }
898 
899         /* reset period counters */
900         rs->time_last_bitmap_sync = end_time;
901         rs->num_dirty_pages_period = 0;
902         rs->bytes_xfer_prev = bytes_xfer_now;
903     }
904     if (migrate_use_events()) {
905         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
906     }
907 }
908 
909 /**
910  * save_zero_page: send the zero page to the stream
911  *
912  * Returns the number of pages written.
913  *
914  * @rs: current RAM state
915  * @block: block that contains the page we want to send
916  * @offset: offset inside the block for the page
917  * @p: pointer to the page
918  */
919 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
920                           uint8_t *p)
921 {
922     int pages = -1;
923 
924     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
925         ram_counters.duplicate++;
926         ram_counters.transferred +=
927             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
928         qemu_put_byte(rs->f, 0);
929         ram_counters.transferred += 1;
930         pages = 1;
931     }
932 
933     return pages;
934 }
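
/* A zero page therefore costs only the page header plus one byte on the
 * wire.  The trailing 0 byte is the fill value kept over from the old
 * RAM_SAVE_FLAG_COMPRESS format (see the comment near the top of this
 * file), which the destination uses to fill the whole page.
 */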
935 
936 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
937 {
938     if (!migrate_release_ram() || !migration_in_postcopy()) {
939         return;
940     }
941 
942     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
943 }
944 
945 /**
946  * ram_save_page: send the given page to the stream
947  *
948  * Returns the number of pages written.
949  *          < 0 - error
950  *          >=0 - Number of pages written - this might legally be 0
951  *                if xbzrle noticed the page was the same.
952  *
953  * @rs: current RAM state
954  * @pss: data about the page we want to send (the block and the
955  *       page's offset within that block)
956  * @last_stage: if we are at the completion stage
957  */
958 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
959 {
960     int pages = -1;
961     uint64_t bytes_xmit;
962     ram_addr_t current_addr;
963     uint8_t *p;
964     int ret;
965     bool send_async = true;
966     RAMBlock *block = pss->block;
967     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
968 
969     p = block->host + offset;
970     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
971 
972     /* When in doubt, send the page as a normal page */
973     bytes_xmit = 0;
974     ret = ram_control_save_page(rs->f, block->offset,
975                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
976     if (bytes_xmit) {
977         ram_counters.transferred += bytes_xmit;
978         pages = 1;
979     }
980 
981     XBZRLE_cache_lock();
982 
983     current_addr = block->offset + offset;
984 
985     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
986         if (ret != RAM_SAVE_CONTROL_DELAYED) {
987             if (bytes_xmit > 0) {
988                 ram_counters.normal++;
989             } else if (bytes_xmit == 0) {
990                 ram_counters.duplicate++;
991             }
992         }
993     } else {
994         pages = save_zero_page(rs, block, offset, p);
995         if (pages > 0) {
996             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
997              * page would be stale
998              */
999             xbzrle_cache_zero_page(rs, current_addr);
1000             ram_release_pages(block->idstr, offset, pages);
1001         } else if (!rs->ram_bulk_stage &&
1002                    !migration_in_postcopy() && migrate_use_xbzrle()) {
1003             pages = save_xbzrle_page(rs, &p, current_addr, block,
1004                                      offset, last_stage);
1005             if (!last_stage) {
1006                 /* Can't send this cached data async, since the cache page
1007                  * might get updated before it gets to the wire
1008                  */
1009                 send_async = false;
1010             }
1011         }
1012     }
1013 
1014     /* XBZRLE overflow or normal page */
1015     if (pages == -1) {
1016         ram_counters.transferred +=
1017             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1018         if (send_async) {
1019             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1020                                   migrate_release_ram() &&
1021                                   migration_in_postcopy());
1022         } else {
1023             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1024         }
1025         ram_counters.transferred += TARGET_PAGE_SIZE;
1026         pages = 1;
1027         ram_counters.normal++;
1028     }
1029 
1030     XBZRLE_cache_unlock();
1031 
1032     return pages;
1033 }
1034 
1035 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1036                                 ram_addr_t offset)
1037 {
1038     RAMState *rs = ram_state;
1039     int bytes_sent, blen;
1040     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1041 
1042     bytes_sent = save_page_header(rs, f, block, offset |
1043                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1044     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1045                                      migrate_compress_level());
1046     if (blen < 0) {
1047         bytes_sent = 0;
1048         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1049         error_report("compressed data failed!");
1050     } else {
1051         bytes_sent += blen;
1052         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1053     }
1054 
1055     return bytes_sent;
1056 }
1057 
1058 static void flush_compressed_data(RAMState *rs)
1059 {
1060     int idx, len, thread_count;
1061 
1062     if (!migrate_use_compression()) {
1063         return;
1064     }
1065     thread_count = migrate_compress_threads();
1066 
1067     qemu_mutex_lock(&comp_done_lock);
1068     for (idx = 0; idx < thread_count; idx++) {
1069         while (!comp_param[idx].done) {
1070             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1071         }
1072     }
1073     qemu_mutex_unlock(&comp_done_lock);
1074 
1075     for (idx = 0; idx < thread_count; idx++) {
1076         qemu_mutex_lock(&comp_param[idx].mutex);
1077         if (!comp_param[idx].quit) {
1078             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1079             ram_counters.transferred += len;
1080         }
1081         qemu_mutex_unlock(&comp_param[idx].mutex);
1082     }
1083 }
1084 
1085 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1086                                        ram_addr_t offset)
1087 {
1088     param->block = block;
1089     param->offset = offset;
1090 }
1091 
1092 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1093                                            ram_addr_t offset)
1094 {
1095     int idx, thread_count, bytes_xmit = -1, pages = -1;
1096 
1097     thread_count = migrate_compress_threads();
1098     qemu_mutex_lock(&comp_done_lock);
1099     while (true) {
1100         for (idx = 0; idx < thread_count; idx++) {
1101             if (comp_param[idx].done) {
1102                 comp_param[idx].done = false;
1103                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1104                 qemu_mutex_lock(&comp_param[idx].mutex);
1105                 set_compress_params(&comp_param[idx], block, offset);
1106                 qemu_cond_signal(&comp_param[idx].cond);
1107                 qemu_mutex_unlock(&comp_param[idx].mutex);
1108                 pages = 1;
1109                 ram_counters.normal++;
1110                 ram_counters.transferred += bytes_xmit;
1111                 break;
1112             }
1113         }
1114         if (pages > 0) {
1115             break;
1116         } else {
1117             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1118         }
1119     }
1120     qemu_mutex_unlock(&comp_done_lock);
1121 
1122     return pages;
1123 }
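
/* Hand-off protocol with the compression threads: the migration thread
 * picks an idle thread (done == true), first flushes whatever that thread
 * compressed previously from its private QEMUFile into rs->f, then hands
 * it the next page via set_compress_params() and signals its condition
 * variable.  If every thread is busy, it waits on comp_done_cond.
 */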
1124 
1125 /**
1126  * ram_save_compressed_page: compress the given page and send it to the stream
1127  *
1128  * Returns the number of pages written.
1129  *
1130  * @rs: current RAM state
1131  * @pss: data about the page we want to send (the block and the
1132  *       page's offset within that block)
1133  * @last_stage: if we are at the completion stage
1134  */
1135 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1136                                     bool last_stage)
1137 {
1138     int pages = -1;
1139     uint64_t bytes_xmit = 0;
1140     uint8_t *p;
1141     int ret, blen;
1142     RAMBlock *block = pss->block;
1143     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1144 
1145     p = block->host + offset;
1146 
1147     ret = ram_control_save_page(rs->f, block->offset,
1148                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1149     if (bytes_xmit) {
1150         ram_counters.transferred += bytes_xmit;
1151         pages = 1;
1152     }
1153     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1154         if (ret != RAM_SAVE_CONTROL_DELAYED) {
1155             if (bytes_xmit > 0) {
1156                 ram_counters.normal++;
1157             } else if (bytes_xmit == 0) {
1158                 ram_counters.duplicate++;
1159             }
1160         }
1161     } else {
1162         /* When starting to process a new block, the first page of the
1163          * block should be sent out before other pages in the same block,
1164          * and all the pages of the previous block should already have been
1165          * sent out.  Keeping this order matters because the 'cont' flag
1166          * is used to avoid resending the block name.
1167          */
1168         if (block != rs->last_sent_block) {
1169             flush_compressed_data(rs);
1170             pages = save_zero_page(rs, block, offset, p);
1171             if (pages == -1) {
1172                 /* Make sure the first page is sent out before other pages */
1173                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1174                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1175                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1176                                                  migrate_compress_level());
1177                 if (blen > 0) {
1178                     ram_counters.transferred += bytes_xmit + blen;
1179                     ram_counters.normal++;
1180                     pages = 1;
1181                 } else {
1182                     qemu_file_set_error(rs->f, blen);
1183                     error_report("compressed data failed!");
1184                 }
1185             }
1186             if (pages > 0) {
1187                 ram_release_pages(block->idstr, offset, pages);
1188             }
1189         } else {
1190             pages = save_zero_page(rs, block, offset, p);
1191             if (pages == -1) {
1192                 pages = compress_page_with_multi_thread(rs, block, offset);
1193             } else {
1194                 ram_release_pages(block->idstr, offset, pages);
1195             }
1196         }
1197     }
1198 
1199     return pages;
1200 }
1201 
1202 /**
1203  * find_dirty_block: find the next dirty page and update any state
1204  * associated with the search process.
1205  *
1206  * Returns true if a dirty page is found
1207  *
1208  * @rs: current RAM state
1209  * @pss: data about the state of the current dirty page scan
1210  * @again: set to false if the search has scanned the whole of RAM
1211  */
1212 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1213 {
1214     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1215     if (pss->complete_round && pss->block == rs->last_seen_block &&
1216         pss->page >= rs->last_page) {
1217         /*
1218          * We've been once around the RAM and haven't found anything.
1219          * Give up.
1220          */
1221         *again = false;
1222         return false;
1223     }
1224     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1225         /* Didn't find anything in this RAM Block */
1226         pss->page = 0;
1227         pss->block = QLIST_NEXT_RCU(pss->block, next);
1228         if (!pss->block) {
1229             /* Hit the end of the list */
1230             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1231             /* Flag that we've looped */
1232             pss->complete_round = true;
1233             rs->ram_bulk_stage = false;
1234             if (migrate_use_xbzrle()) {
1235                 /* If xbzrle is on, stop using the data compression at this
1236                  * point. In theory, xbzrle can do better than compression.
1237                  */
1238                 flush_compressed_data(rs);
1239             }
1240         }
1241         /* Didn't find anything this time, but try again on the new block */
1242         *again = true;
1243         return false;
1244     } else {
1245         /* Can go around again, but... */
1246         *again = true;
1247         /* We've found something so probably don't need to */
1248         return true;
1249     }
1250 }
1251 
1252 /**
1253  * unqueue_page: gets a page off the queue
1254  *
1255  * Helper for 'get_queued_page' - gets a page off the queue
1256  *
1257  * Returns the block of the page (or NULL if none available)
1258  *
1259  * @rs: current RAM state
1260  * @offset: used to return the offset within the RAMBlock
1261  */
1262 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1263 {
1264     RAMBlock *block = NULL;
1265 
1266     qemu_mutex_lock(&rs->src_page_req_mutex);
1267     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1268         struct RAMSrcPageRequest *entry =
1269                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1270         block = entry->rb;
1271         *offset = entry->offset;
1272 
1273         if (entry->len > TARGET_PAGE_SIZE) {
1274             entry->len -= TARGET_PAGE_SIZE;
1275             entry->offset += TARGET_PAGE_SIZE;
1276         } else {
1277             memory_region_unref(block->mr);
1278             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1279             g_free(entry);
1280         }
1281     }
1282     qemu_mutex_unlock(&rs->src_page_req_mutex);
1283 
1284     return block;
1285 }
1286 
1287 /**
1288  * get_queued_page: unqueue a page from the postcopy requests
1289  *
1290  * Skips pages that have already been sent (!dirty)
1291  *
1292  * Returns true if a queued page is found
1293  *
1294  * @rs: current RAM state
1295  * @pss: data about the state of the current dirty page scan
1296  */
1297 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1298 {
1299     RAMBlock  *block;
1300     ram_addr_t offset;
1301     bool dirty;
1302 
1303     do {
1304         block = unqueue_page(rs, &offset);
1305         /*
1306          * We're sending this page, and since it's postcopy nothing else
1307          * will dirty it, and we must make sure it doesn't get sent again
1308          * even if this queue request was received after the background
1309          * search already sent it.
1310          */
1311         if (block) {
1312             unsigned long page;
1313 
1314             page = offset >> TARGET_PAGE_BITS;
1315             dirty = test_bit(page, block->bmap);
1316             if (!dirty) {
1317                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1318                        page, test_bit(page, block->unsentmap));
1319             } else {
1320                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1321             }
1322         }
1323 
1324     } while (block && !dirty);
1325 
1326     if (block) {
1327         /*
1328          * As soon as we start servicing pages out of order, then we have
1329          * to kill the bulk stage, since the bulk stage assumes
1330          * in (migration_bitmap_find_and_reset_dirty) that every page is
1331          * dirty, that's no longer true.
1332          */
1333         rs->ram_bulk_stage = false;
1334 
1335         /*
1336          * We want the background search to continue from the queued page
1337          * since the guest is likely to want other pages near to the page
1338          * it just requested.
1339          */
1340         pss->block = block;
1341         pss->page = offset >> TARGET_PAGE_BITS;
1342     }
1343 
1344     return !!block;
1345 }
1346 
1347 /**
1348  * migration_page_queue_free: drop any remaining pages in the ram
1349  * request queue
1350  *
1351  * It should be empty at the end anyway, but in error cases there may
1352  * be some left.  in case that there is any page left, we drop it.
1353  *
1354  */
1355 static void migration_page_queue_free(RAMState *rs)
1356 {
1357     struct RAMSrcPageRequest *mspr, *next_mspr;
1358     /* This queue should generally be empty - but in the case of a failed
1359      * migration it might have some entries left over.
1360      */
1361     rcu_read_lock();
1362     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1363         memory_region_unref(mspr->rb->mr);
1364         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1365         g_free(mspr);
1366     }
1367     rcu_read_unlock();
1368 }
1369 
1370 /**
1371  * ram_save_queue_pages: queue the page for transmission
1372  *
1373  * A request from postcopy destination for example.
1374  *
1375  * Returns zero on success or negative on error
1376  *
1377  * @rbname: Name of the RAMBlock of the request. NULL means the
1378  *          same as the last one.
1379  * @start: starting address from the start of the RAMBlock
1380  * @len: length (in bytes) to send
1381  */
1382 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1383 {
1384     RAMBlock *ramblock;
1385     RAMState *rs = ram_state;
1386 
1387     ram_counters.postcopy_requests++;
1388     rcu_read_lock();
1389     if (!rbname) {
1390         /* Reuse last RAMBlock */
1391         ramblock = rs->last_req_rb;
1392 
1393         if (!ramblock) {
1394             /*
1395              * Shouldn't happen, we can't reuse the last RAMBlock if
1396              * it's the 1st request.
1397              */
1398             error_report("ram_save_queue_pages no previous block");
1399             goto err;
1400         }
1401     } else {
1402         ramblock = qemu_ram_block_by_name(rbname);
1403 
1404         if (!ramblock) {
1405             /* We shouldn't be asked for a non-existent RAMBlock */
1406             error_report("ram_save_queue_pages no block '%s'", rbname);
1407             goto err;
1408         }
1409         rs->last_req_rb = ramblock;
1410     }
1411     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1412     if (start + len > ramblock->used_length) {
1413         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1414                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1415                      __func__, start, len, ramblock->used_length);
1416         goto err;
1417     }
1418 
1419     struct RAMSrcPageRequest *new_entry =
1420         g_malloc0(sizeof(struct RAMSrcPageRequest));
1421     new_entry->rb = ramblock;
1422     new_entry->offset = start;
1423     new_entry->len = len;
1424 
1425     memory_region_ref(ramblock->mr);
1426     qemu_mutex_lock(&rs->src_page_req_mutex);
1427     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1428     qemu_mutex_unlock(&rs->src_page_req_mutex);
1429     rcu_read_unlock();
1430 
1431     return 0;
1432 
1433 err:
1434     rcu_read_unlock();
1435     return -1;
1436 }
1437 
1438 /**
1439  * ram_save_target_page: save one target page
1440  *
1441  * Returns the number of pages written
1442  *
1443  * @rs: current RAM state
1444  * @pss: data about the page we want to send (the block and the
1445  *       page's offset within that block)
1446  * @last_stage: if we are at the completion stage
1447  */
1448 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1449                                 bool last_stage)
1450 {
1451     int res = 0;
1452 
1453     /* Check whether the page is dirty and, if it is, send it */
1454     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1455         /*
1456          * If xbzrle is on, stop using the data compression after first
1457          * round of migration even if compression is enabled. In theory,
1458          * xbzrle can do better than compression.
1459          */
1460         if (migrate_use_compression() &&
1461             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1462             res = ram_save_compressed_page(rs, pss, last_stage);
1463         } else {
1464             res = ram_save_page(rs, pss, last_stage);
1465         }
1466 
1467         if (res < 0) {
1468             return res;
1469         }
1470         if (pss->block->unsentmap) {
1471             clear_bit(pss->page, pss->block->unsentmap);
1472         }
1473     }
1474 
1475     return res;
1476 }
1477 
1478 /**
1479  * ram_save_host_page: save a whole host page
1480  *
1481  * Starting at pss->page, send pages up to the end of the current host
1482  * page. It's valid for the initial page to point into the middle of
1483  * a host page, in which case the remainder of the host page is sent.
1484  * Only dirty target pages are sent. Note that the host page size may
1485  * be a huge page for this block.
1486  * The saving stops at the boundary of the used_length of the block
1487  * if the RAMBlock isn't a multiple of the host page size.
1488  *
1489  * Returns the number of pages written or negative on error
1490  *
1491  * @rs: current RAM state
1492  * @pss: data about the page we want to send (the block and the
1493  *       starting page within that block)
1494  * @last_stage: if we are at the completion stage
1495  */
1496 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1497                               bool last_stage)
1498 {
1499     int tmppages, pages = 0;
1500     size_t pagesize_bits =
1501         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1502 
1503     do {
1504         tmppages = ram_save_target_page(rs, pss, last_stage);
1505         if (tmppages < 0) {
1506             return tmppages;
1507         }
1508 
1509         pages += tmppages;
1510         pss->page++;
1511     } while ((pss->page & (pagesize_bits - 1)) &&
1512              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1513 
1514     /* The offset we leave with is the last one we looked at */
1515     pss->page--;
1516     return pages;
1517 }
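
/* For example, for a RAMBlock backed by 2 MiB huge pages with 4 KiB
 * target pages, pagesize_bits is 512, so one call sends up to 512 dirty
 * target pages (one whole host page).
 */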
1518 
1519 /**
1520  * ram_find_and_save_block: finds a dirty page and sends it to f
1521  *
1522  * Called within an RCU critical section.
1523  *
1524  * Returns the number of pages written where zero means no dirty pages
1525  *
1526  * @rs: current RAM state
1527  * @last_stage: if we are at the completion stage
1528  *
1529  * On systems where host-page-size > target-page-size it will send all the
1530  * pages in a host page that are dirty.
1531  */
1532 
1533 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1534 {
1535     PageSearchStatus pss;
1536     int pages = 0;
1537     bool again, found;
1538 
1539     /* No dirty pages, as there is no RAM at all */
1540     if (!ram_bytes_total()) {
1541         return pages;
1542     }
1543 
1544     pss.block = rs->last_seen_block;
1545     pss.page = rs->last_page;
1546     pss.complete_round = false;
1547 
1548     if (!pss.block) {
1549         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1550     }
1551 
1552     do {
1553         again = true;
1554         found = get_queued_page(rs, &pss);
1555 
1556         if (!found) {
1557             /* priority queue empty, so just search for something dirty */
1558             found = find_dirty_block(rs, &pss, &again);
1559         }
1560 
1561         if (found) {
1562             pages = ram_save_host_page(rs, &pss, last_stage);
1563         }
1564     } while (!pages && again);
1565 
1566     rs->last_seen_block = pss.block;
1567     rs->last_page = pss.page;
1568 
1569     return pages;
1570 }
1571 
1572 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1573 {
1574     uint64_t pages = size / TARGET_PAGE_SIZE;
1575 
1576     if (zero) {
1577         ram_counters.duplicate += pages;
1578     } else {
1579         ram_counters.normal += pages;
1580         ram_counters.transferred += size;
1581         qemu_update_position(f, size);
1582     }
1583 }
1584 
1585 uint64_t ram_bytes_total(void)
1586 {
1587     RAMBlock *block;
1588     uint64_t total = 0;
1589 
1590     rcu_read_lock();
1591     RAMBLOCK_FOREACH(block) {
1592         total += block->used_length;
1593     }
1594     rcu_read_unlock();
1595     return total;
1596 }
1597 
1598 static void xbzrle_load_setup(void)
1599 {
1600     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1601 }
1602 
1603 static void xbzrle_load_cleanup(void)
1604 {
1605     g_free(XBZRLE.decoded_buf);
1606     XBZRLE.decoded_buf = NULL;
1607 }
1608 
1609 static void ram_state_cleanup(RAMState **rsp)
1610 {
1611     migration_page_queue_free(*rsp);
1612     qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1613     qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1614     g_free(*rsp);
1615     *rsp = NULL;
1616 }
1617 
1618 static void xbzrle_cleanup(void)
1619 {
1620     XBZRLE_cache_lock();
1621     if (XBZRLE.cache) {
1622         cache_fini(XBZRLE.cache);
1623         g_free(XBZRLE.encoded_buf);
1624         g_free(XBZRLE.current_buf);
1625         g_free(XBZRLE.zero_target_page);
1626         XBZRLE.cache = NULL;
1627         XBZRLE.encoded_buf = NULL;
1628         XBZRLE.current_buf = NULL;
1629         XBZRLE.zero_target_page = NULL;
1630     }
1631     XBZRLE_cache_unlock();
1632 }
1633 
1634 static void ram_save_cleanup(void *opaque)
1635 {
1636     RAMState **rsp = opaque;
1637     RAMBlock *block;
1638 
1639     /* The caller must hold the iothread lock or be in a bottom half, so
1640      * there is no write race against this migration bitmap.
1641      */
1642     memory_global_dirty_log_stop();
1643 
1644     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1645         g_free(block->bmap);
1646         block->bmap = NULL;
1647         g_free(block->unsentmap);
1648         block->unsentmap = NULL;
1649     }
1650 
1651     xbzrle_cleanup();
1652     compress_threads_save_cleanup();
1653     ram_state_cleanup(rsp);
1654 }
1655 
1656 static void ram_state_reset(RAMState *rs)
1657 {
1658     rs->last_seen_block = NULL;
1659     rs->last_sent_block = NULL;
1660     rs->last_page = 0;
1661     rs->last_version = ram_list.version;
1662     rs->ram_bulk_stage = true;
1663 }
1664 
1665 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1666 
1667 /*
1668  * Dump 'pages' bits of the 'todump' bitmap to stderr, 128 bits per line.
1669  * 'expected' is the value you expect the bitmap mostly to be full of;
1670  * lines that consist entirely of this value are not printed.
1671  */
1672 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1673                            unsigned long pages)
1674 {
1675     int64_t cur;
1676     int64_t linelen = 128;
1677     char linebuf[129];
1678 
1679     for (cur = 0; cur < pages; cur += linelen) {
1680         int64_t curb;
1681         bool found = false;
1682         /*
1683          * Last line; catch the case where the line length
1684          * is longer than remaining ram
1685          */
1686         if (cur + linelen > pages) {
1687             linelen = pages - cur;
1688         }
1689         for (curb = 0; curb < linelen; curb++) {
1690             bool thisbit = test_bit(cur + curb, todump);
1691             linebuf[curb] = thisbit ? '1' : '.';
1692             found = found || (thisbit != expected);
1693         }
1694         if (found) {
1695             linebuf[curb] = '\0';
1696             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1697         }
1698     }
1699 }
1700 
1701 /* **** functions for postcopy ***** */
1702 
1703 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1704 {
1705     struct RAMBlock *block;
1706 
1707     RAMBLOCK_FOREACH(block) {
1708         unsigned long *bitmap = block->bmap;
1709         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1710         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1711 
1712         while (run_start < range) {
1713             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1714             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1715                               (run_end - run_start) << TARGET_PAGE_BITS);
1716             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1717         }
1718     }
1719 }
1720 
1721 /**
1722  * postcopy_send_discard_bm_ram: discard a RAMBlock
1723  *
1724  * Returns zero on success
1725  *
1726  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1727  * Note: At this point the 'unsentmap' is the processed bitmap combined
1728  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1729  *
1730  * @ms: current migration state
1731  * @pds: state for postcopy
1732  * @block: RAMBlock whose unsent/dirty pages are to be discarded
1734  */
1735 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1736                                         PostcopyDiscardState *pds,
1737                                         RAMBlock *block)
1738 {
1739     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1740     unsigned long current;
1741     unsigned long *unsentmap = block->unsentmap;
1742 
1743     for (current = 0; current < end; ) {
1744         unsigned long one = find_next_bit(unsentmap, end, current);
1745 
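        /* find_next_bit() returns 'end' when no bit is set; in that case
         * 'discard_length' below ends up zero and the loop terminates.
         */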
1746         if (one <= end) {
1747             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1748             unsigned long discard_length;
1749 
1750             if (zero >= end) {
1751                 discard_length = end - one;
1752             } else {
1753                 discard_length = zero - one;
1754             }
1755             if (discard_length) {
1756                 postcopy_discard_send_range(ms, pds, one, discard_length);
1757             }
1758             current = one + discard_length;
1759         } else {
1760             current = one;
1761         }
1762     }
1763 
1764     return 0;
1765 }
1766 
1767 /**
1768  * postcopy_each_ram_send_discard: discard all RAMBlocks
1769  *
1770  * Returns 0 for success or negative for error
1771  *
1772  * Utility for the outgoing postcopy code.
1773  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1774  *   passing it bitmap indexes and name.
1775  * (qemu_ram_foreach_block ends up passing unscaled lengths, which would
1776  *  mean the postcopy code would have to deal with target page sizes)
1777  *
1778  * @ms: current migration state
1779  */
1780 static int postcopy_each_ram_send_discard(MigrationState *ms)
1781 {
1782     struct RAMBlock *block;
1783     int ret;
1784 
1785     RAMBLOCK_FOREACH(block) {
1786         PostcopyDiscardState *pds =
1787             postcopy_discard_send_init(ms, block->idstr);
1788 
1789         /*
1790          * Postcopy sends chunks of bitmap over the wire, but it
1791          * just needs indexes at this point; this avoids it having
1792          * target page specific code.
1793          */
1794         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1795         postcopy_discard_send_finish(ms, pds);
1796         if (ret) {
1797             return ret;
1798         }
1799     }
1800 
1801     return 0;
1802 }
1803 
1804 /**
1805  * postcopy_chunk_hostpages_pass: canonicalize bitmaps in host pages
1806  *
1807  * Helper for postcopy_chunk_hostpages; it's called twice to
1808  * canonicalize the two bitmaps, which are similar but one is
1809  * inverted.
1810  *
1811  * Postcopy requires that all target pages in a host page are dirty or
1812  * clean, not a mix.  This function canonicalizes the bitmaps.
1813  *
1814  * @ms: current migration state
1815  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1816  *               otherwise we need to canonicalize partially dirty host pages
1817  * @block: block that contains the page we want to canonicalize
1818  * @pds: state for postcopy
1819  */
1820 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1821                                           RAMBlock *block,
1822                                           PostcopyDiscardState *pds)
1823 {
1824     RAMState *rs = ram_state;
1825     unsigned long *bitmap = block->bmap;
1826     unsigned long *unsentmap = block->unsentmap;
1827     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1828     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1829     unsigned long run_start;
1830 
1831     if (block->page_size == TARGET_PAGE_SIZE) {
1832         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1833         return;
1834     }
1835 
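    /*
     * Illustrative example: with 2MB huge pages backing the block and 4KB
     * target pages, host_ratio is 512; any run of sent/dirty target pages
     * that starts or ends inside a 512-page host page gets the whole host
     * page discarded and re-marked dirty by the fixup code below.
     */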
1836     if (unsent_pass) {
1837         /* Find a sent page */
1838         run_start = find_next_zero_bit(unsentmap, pages, 0);
1839     } else {
1840         /* Find a dirty page */
1841         run_start = find_next_bit(bitmap, pages, 0);
1842     }
1843 
1844     while (run_start < pages) {
1845         bool do_fixup = false;
1846         unsigned long fixup_start_addr;
1847         unsigned long host_offset;
1848 
1849         /*
1850          * If the start of this run of pages is in the middle of a host
1851          * page, then we need to fixup this host page.
1852          */
1853         host_offset = run_start % host_ratio;
1854         if (host_offset) {
1855             do_fixup = true;
1856             run_start -= host_offset;
1857             fixup_start_addr = run_start;
1858             /* For the next pass */
1859             run_start = run_start + host_ratio;
1860         } else {
1861             /* Find the end of this run */
1862             unsigned long run_end;
1863             if (unsent_pass) {
1864                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1865             } else {
1866                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1867             }
1868             /*
1869              * If the end isn't at the start of a host page, then the
1870              * run doesn't finish at the end of a host page
1871              * and we need to discard.
1872              */
1873             host_offset = run_end % host_ratio;
1874             if (host_offset) {
1875                 do_fixup = true;
1876                 fixup_start_addr = run_end - host_offset;
1877                 /*
1878                  * This host page has gone, the next loop iteration starts
1879                  * from after the fixup
1880                  */
1881                 run_start = fixup_start_addr + host_ratio;
1882             } else {
1883                 /*
1884                  * No discards on this iteration, next loop starts from
1885                  * next sent/dirty page
1886                  */
1887                 run_start = run_end + 1;
1888             }
1889         }
1890 
1891         if (do_fixup) {
1892             unsigned long page;
1893 
1894             /* Tell the destination to discard this page */
1895             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1896                 /* For the unsent_pass we:
1897                  *     discard partially sent pages
1898                  * For the !unsent_pass (dirty) we:
1899                  *     discard partially dirty pages that were sent
1900                  *     (any partially sent pages were already discarded
1901                  *     by the previous unsent_pass)
1902                  */
1903                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1904                                             host_ratio);
1905             }
1906 
1907             /* Clean up the bitmap */
1908             for (page = fixup_start_addr;
1909                  page < fixup_start_addr + host_ratio; page++) {
1910                 /* All pages in this host page are now not sent */
1911                 set_bit(page, unsentmap);
1912 
1913                 /*
1914                  * Remark them as dirty, updating the count for any pages
1915                  * that weren't previously dirty.
1916                  */
1917                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1918             }
1919         }
1920 
1921         if (unsent_pass) {
1922             /* Find the next sent page for the next iteration */
1923             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1924         } else {
1925             /* Find the next dirty page for the next iteration */
1926             run_start = find_next_bit(bitmap, pages, run_start);
1927         }
1928     }
1929 }
1930 
1931 /**
1932  * postcopy_chunk_hostpages: discard any partially sent host page
1933  *
1934  * Utility for the outgoing postcopy code.
1935  *
1936  * Discard any partially sent host-page sized chunks and mark any partially
1937  * dirty host-page sized chunks as all dirty.  Here the host page is the
1938  * host page of the particular RAMBlock, i.e. it might be a huge page.
1939  *
1940  * Returns zero on success
1941  *
1942  * @ms: current migration state
1943  * @block: block we want to work with
1944  */
1945 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1946 {
1947     PostcopyDiscardState *pds =
1948         postcopy_discard_send_init(ms, block->idstr);
1949 
1950     /* First pass: Discard all partially sent host pages */
1951     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1952     /*
1953      * Second pass: Ensure that all partially dirty host pages are made
1954      * fully dirty.
1955      */
1956     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1957 
1958     postcopy_discard_send_finish(ms, pds);
1959     return 0;
1960 }
1961 
1962 /**
1963  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1964  *
1965  * Returns zero on success
1966  *
1967  * Transmit the set of pages to be discarded after precopy to the target;
1968  * these are pages that:
1969  *     a) have been previously transmitted but are now dirty again
1970  *     b) have never been transmitted; this ensures that any pages on the
1971  *        destination that have been mapped by background tasks get
1972  *        discarded (transparent huge pages are the specific concern)
1973  * Hopefully this is pretty sparse.
1974  *
1975  * @ms: current migration state
1976  */
1977 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1978 {
1979     RAMState *rs = ram_state;
1980     RAMBlock *block;
1981     int ret;
1982 
1983     rcu_read_lock();
1984 
1985     /* This should be our last sync, the src is now paused */
1986     migration_bitmap_sync(rs);
1987 
1988     /* Easiest way to make sure we don't resume in the middle of a host-page */
1989     rs->last_seen_block = NULL;
1990     rs->last_sent_block = NULL;
1991     rs->last_page = 0;
1992 
1993     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1994         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1995         unsigned long *bitmap = block->bmap;
1996         unsigned long *unsentmap = block->unsentmap;
1997 
1998         if (!unsentmap) {
1999             /* We don't have a safe way to resize the unsentmap, so
2000              * if the bitmap was resized it will be NULL at this
2001              * point.
2002              */
2003             error_report("migration ram resized during precopy phase");
2004             rcu_read_unlock();
2005             return -EINVAL;
2006         }
2007         /* Deal with TPS != HPS and huge pages */
2008         ret = postcopy_chunk_hostpages(ms, block);
2009         if (ret) {
2010             rcu_read_unlock();
2011             return ret;
2012         }
2013 
2014         /*
2015          * Update the unsentmap to be unsentmap = unsentmap | dirty
2016          */
2017         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2018 #ifdef DEBUG_POSTCOPY
2019         ram_debug_dump_bitmap(unsentmap, true, pages);
2020 #endif
2021     }
2022     trace_ram_postcopy_send_discard_bitmap();
2023 
2024     ret = postcopy_each_ram_send_discard(ms);
2025     rcu_read_unlock();
2026 
2027     return ret;
2028 }
2029 
2030 /**
2031  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2032  *
2033  * Returns zero on success
2034  *
2035  * @rbname: name of the RAMBlock of the request. NULL means the
2036  *          same as the last one.
2037  * @start: byte offset within the RAMBlock to start discarding at
2038  * @length: number of bytes to discard
2039  */
2040 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2041 {
2042     int ret = -1;
2043 
2044     trace_ram_discard_range(rbname, start, length);
2045 
2046     rcu_read_lock();
2047     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2048 
2049     if (!rb) {
2050         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2051         goto err;
2052     }
2053 
2054     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2055                  length >> qemu_target_page_bits());
2056     ret = ram_block_discard_range(rb, start, length);
2057 
2058 err:
2059     rcu_read_unlock();
2060 
2061     return ret;
2062 }
2063 
2064 /*
2065  * For every allocation, we try not to crash the VM if the
2066  * allocation fails.
2067  */
2068 static int xbzrle_init(void)
2069 {
2070     Error *local_err = NULL;
2071 
2072     if (!migrate_use_xbzrle()) {
2073         return 0;
2074     }
2075 
2076     XBZRLE_cache_lock();
2077 
2078     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2079     if (!XBZRLE.zero_target_page) {
2080         error_report("%s: Error allocating zero page", __func__);
2081         goto err_out;
2082     }
2083 
2084     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2085                               TARGET_PAGE_SIZE, &local_err);
2086     if (!XBZRLE.cache) {
2087         error_report_err(local_err);
2088         goto free_zero_page;
2089     }
2090 
2091     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2092     if (!XBZRLE.encoded_buf) {
2093         error_report("%s: Error allocating encoded_buf", __func__);
2094         goto free_cache;
2095     }
2096 
2097     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2098     if (!XBZRLE.current_buf) {
2099         error_report("%s: Error allocating current_buf", __func__);
2100         goto free_encoded_buf;
2101     }
2102 
2103     /* We are all good */
2104     XBZRLE_cache_unlock();
2105     return 0;
2106 
2107 free_encoded_buf:
2108     g_free(XBZRLE.encoded_buf);
2109     XBZRLE.encoded_buf = NULL;
2110 free_cache:
2111     cache_fini(XBZRLE.cache);
2112     XBZRLE.cache = NULL;
2113 free_zero_page:
2114     g_free(XBZRLE.zero_target_page);
2115     XBZRLE.zero_target_page = NULL;
2116 err_out:
2117     XBZRLE_cache_unlock();
2118     return -ENOMEM;
2119 }
2120 
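/*
 * Allocate and initialise the RAMState: its mutexes and page request queue,
 * plus the initial dirty-page count (every page starts out dirty).
 */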
2121 static int ram_state_init(RAMState **rsp)
2122 {
2123     *rsp = g_try_new0(RAMState, 1);
2124 
2125     if (!*rsp) {
2126         error_report("%s: Init ramstate fail", __func__);
2127         return -1;
2128     }
2129 
2130     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2131     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2132     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2133 
2134     /*
2135      * Count the total number of pages used by ram blocks not including any
2136      * gaps due to alignment or unplugs.
2137      */
2138     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2139 
2140     ram_state_reset(*rsp);
2141 
2142     return 0;
2143 }
2144 
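/*
 * Allocate the per-RAMBlock dirty bitmap (and, for postcopy, the unsent
 * map), sized on max_length and initially fully set.
 */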
2145 static void ram_list_init_bitmaps(void)
2146 {
2147     RAMBlock *block;
2148     unsigned long pages;
2149 
2150     /* Skip setting bitmap if there is no RAM */
2151     if (ram_bytes_total()) {
2152         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2153             pages = block->max_length >> TARGET_PAGE_BITS;
2154             block->bmap = bitmap_new(pages);
2155             bitmap_set(block->bmap, 0, pages);
2156             if (migrate_postcopy_ram()) {
2157                 block->unsentmap = bitmap_new(pages);
2158                 bitmap_set(block->unsentmap, 0, pages);
2159             }
2160         }
2161     }
2162 }
2163 
2164 static void ram_init_bitmaps(RAMState *rs)
2165 {
2166     /* For memory_global_dirty_log_start below.  */
2167     qemu_mutex_lock_iothread();
2168     qemu_mutex_lock_ramlist();
2169     rcu_read_lock();
2170 
2171     ram_list_init_bitmaps();
2172     memory_global_dirty_log_start();
2173     migration_bitmap_sync(rs);
2174 
2175     rcu_read_unlock();
2176     qemu_mutex_unlock_ramlist();
2177     qemu_mutex_unlock_iothread();
2178 }
2179 
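/*
 * One-stop setup for the outgoing side: RAMState, XBZRLE buffers/cache and
 * the per-block bitmaps, including an initial dirty bitmap sync.
 */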
2180 static int ram_init_all(RAMState **rsp)
2181 {
2182     if (ram_state_init(rsp)) {
2183         return -1;
2184     }
2185 
2186     if (xbzrle_init()) {
2187         ram_state_cleanup(rsp);
2188         return -1;
2189     }
2190 
2191     ram_init_bitmaps(*rsp);
2192 
2193     return 0;
2194 }
2195 
2196 /*
2197  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2198  * long-running RCU critical section.  When RCU reclaims in the code
2199  * start to become numerous, it will be necessary to reduce the
2200  * granularity of these critical sections.
2201  */
2202 
2203 /**
2204  * ram_save_setup: Setup RAM for migration
2205  *
2206  * Returns zero to indicate success and negative for error
2207  *
2208  * @f: QEMUFile where to send the data
2209  * @opaque: RAMState pointer
2210  */
2211 static int ram_save_setup(QEMUFile *f, void *opaque)
2212 {
2213     RAMState **rsp = opaque;
2214     RAMBlock *block;
2215 
2216     /* migration has already set up the bitmap, reuse it. */
2217     if (!migration_in_colo_state()) {
2218         if (ram_init_all(rsp) != 0) {
2219             return -1;
2220         }
2221     }
2222     (*rsp)->f = f;
2223 
2224     rcu_read_lock();
2225 
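    /*
     * Stream header layout (as parsed by ram_load): a be64 whose sub-page
     * bits carry RAM_SAVE_FLAG_MEM_SIZE and whose page-aligned part is the
     * total RAM size, followed by one record per RAMBlock: idstr length
     * byte, idstr, used_length and, when postcopy may need it, page_size.
     */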
2226     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2227 
2228     RAMBLOCK_FOREACH(block) {
2229         qemu_put_byte(f, strlen(block->idstr));
2230         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2231         qemu_put_be64(f, block->used_length);
2232         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2233             qemu_put_be64(f, block->page_size);
2234         }
2235     }
2236 
2237     rcu_read_unlock();
2238     compress_threads_save_setup();
2239 
2240     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2241     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2242 
2243     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2244 
2245     return 0;
2246 }
2247 
2248 /**
2249  * ram_save_iterate: iterative stage for migration
2250  *
2251  * Returns zero to indicate success and negative for error
2252  *
2253  * @f: QEMUFile where to send the data
2254  * @opaque: RAMState pointer
2255  */
2256 static int ram_save_iterate(QEMUFile *f, void *opaque)
2257 {
2258     RAMState **temp = opaque;
2259     RAMState *rs = *temp;
2260     int ret;
2261     int i;
2262     int64_t t0;
2263     int done = 0;
2264 
2265     rcu_read_lock();
2266     if (ram_list.version != rs->last_version) {
2267         ram_state_reset(rs);
2268     }
2269 
2270     /* Read version before ram_list.blocks */
2271     smp_rmb();
2272 
2273     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2274 
2275     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2276     i = 0;
2277     while ((ret = qemu_file_rate_limit(f)) == 0) {
2278         int pages;
2279 
2280         pages = ram_find_and_save_block(rs, false);
2281         /* no more pages to send */
2282         if (pages == 0) {
2283             done = 1;
2284             break;
2285         }
2286         rs->iterations++;
2287 
2288         /* We want to check in the 1st loop, just in case it was the 1st
2289            time and we had to sync the dirty bitmap.
2290            qemu_clock_get_ns() is a bit expensive, so we only check every
2291            64 iterations.
2292         */
2293         if ((i & 63) == 0) {
2294             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2295             if (t1 > MAX_WAIT) {
2296                 trace_ram_save_iterate_big_wait(t1, i);
2297                 break;
2298             }
2299         }
2300         i++;
2301     }
2302     flush_compressed_data(rs);
2303     rcu_read_unlock();
2304 
2305     /*
2306      * Must occur before EOS (or any QEMUFile operation)
2307      * because of RDMA protocol.
2308      */
2309     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2310 
2311     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2312     ram_counters.transferred += 8;
2313 
2314     ret = qemu_file_get_error(f);
2315     if (ret < 0) {
2316         return ret;
2317     }
2318 
2319     return done;
2320 }
2321 
2322 /**
2323  * ram_save_complete: function called to send the remaining amount of ram
2324  *
2325  * Returns zero to indicate success
2326  *
2327  * Called with iothread lock
2328  *
2329  * @f: QEMUFile where to send the data
2330  * @opaque: RAMState pointer
2331  */
2332 static int ram_save_complete(QEMUFile *f, void *opaque)
2333 {
2334     RAMState **temp = opaque;
2335     RAMState *rs = *temp;
2336 
2337     rcu_read_lock();
2338 
2339     if (!migration_in_postcopy()) {
2340         migration_bitmap_sync(rs);
2341     }
2342 
2343     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2344 
2345     /* try transferring iterative blocks of memory */
2346 
2347     /* flush all remaining blocks regardless of rate limiting */
2348     while (true) {
2349         int pages;
2350 
2351         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2352         /* no more blocks to send */
2353         if (pages == 0) {
2354             break;
2355         }
2356     }
2357 
2358     flush_compressed_data(rs);
2359     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2360 
2361     rcu_read_unlock();
2362 
2363     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2364 
2365     return 0;
2366 }
2367 
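/*
 * Report how much data is still left to send.  If the remaining dirty RAM
 * is already below @max_size, resync the dirty bitmap first (under the
 * iothread lock) to get an up-to-date estimate.  The result is accounted
 * as postcopiable when postcopy RAM is enabled, non-postcopiable otherwise.
 */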
2368 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2369                              uint64_t *non_postcopiable_pending,
2370                              uint64_t *postcopiable_pending)
2371 {
2372     RAMState **temp = opaque;
2373     RAMState *rs = *temp;
2374     uint64_t remaining_size;
2375 
2376     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2377 
2378     if (!migration_in_postcopy() &&
2379         remaining_size < max_size) {
2380         qemu_mutex_lock_iothread();
2381         rcu_read_lock();
2382         migration_bitmap_sync(rs);
2383         rcu_read_unlock();
2384         qemu_mutex_unlock_iothread();
2385         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2386     }
2387 
2388     if (migrate_postcopy_ram()) {
2389         /* We can do postcopy, and all the data is postcopiable */
2390         *postcopiable_pending += remaining_size;
2391     } else {
2392         *non_postcopiable_pending += remaining_size;
2393     }
2394 }
2395 
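/*
 * Read one XBZRLE-encoded page from the stream and decode it on top of the
 * existing contents of @host.  Returns 0 on success, -1 on a malformed
 * header or decode failure.
 */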
2396 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2397 {
2398     unsigned int xh_len;
2399     int xh_flags;
2400     uint8_t *loaded_data;
2401 
2402     /* extract RLE header */
2403     xh_flags = qemu_get_byte(f);
2404     xh_len = qemu_get_be16(f);
2405 
2406     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2407         error_report("Failed to load XBZRLE page - wrong compression!");
2408         return -1;
2409     }
2410 
2411     if (xh_len > TARGET_PAGE_SIZE) {
2412         error_report("Failed to load XBZRLE page - len overflow!");
2413         return -1;
2414     }
2415     loaded_data = XBZRLE.decoded_buf;
2416     /* load data and decode */
2417     /* it can change loaded_data to point to an internal buffer */
2418     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2419 
2420     /* decode RLE */
2421     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2422                              TARGET_PAGE_SIZE) == -1) {
2423         error_report("Failed to load XBZRLE page - decode error!");
2424         return -1;
2425     }
2426 
2427     return 0;
2428 }
2429 
2430 /**
2431  * ram_block_from_stream: read a RAMBlock id from the migration stream
2432  *
2433  * Must be called from within a rcu critical section.
2434  *
2435  * Returns a pointer from within the RCU-protected ram_list.
2436  *
2437  * @f: QEMUFile where to read the data from
2438  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2439  */
2440 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2441 {
2442     static RAMBlock *block = NULL;
2443     char id[256];
2444     uint8_t len;
2445 
2446     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2447         if (!block) {
2448             error_report("Ack, bad migration stream!");
2449             return NULL;
2450         }
2451         return block;
2452     }
2453 
2454     len = qemu_get_byte(f);
2455     qemu_get_buffer(f, (uint8_t *)id, len);
2456     id[len] = 0;
2457 
2458     block = qemu_ram_block_by_name(id);
2459     if (!block) {
2460         error_report("Can't find block %s", id);
2461         return NULL;
2462     }
2463 
2464     return block;
2465 }
2466 
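/*
 * Return the host address for @offset within @block, or NULL if the offset
 * lies outside the block's used length.
 */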
2467 static inline void *host_from_ram_block_offset(RAMBlock *block,
2468                                                ram_addr_t offset)
2469 {
2470     if (!offset_in_ramblock(block, offset)) {
2471         return NULL;
2472     }
2473 
2474     return block->host + offset;
2475 }
2476 
2477 /**
2478  * ram_handle_compressed: handle the zero page case
2479  *
2480  * If a page (or a whole RDMA chunk) has been
2481  * determined to be zero, then zap it.
2482  *
2483  * @host: host address for the zero page
2484  * @ch: the byte the page is filled with.  We only support zero
2485  * @size: size of the zero page
2486  */
2487 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2488 {
2489     if (ch != 0 || !is_zero_range(host, size)) {
2490         memset(host, ch, size);
2491     }
2492 }
2493 
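/*
 * Decompression worker thread: waits for a compressed page handed over by
 * decompress_data_with_multi_threads(), inflates it into the destination
 * host page and signals completion on decomp_done_cond.
 */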
2494 static void *do_data_decompress(void *opaque)
2495 {
2496     DecompressParam *param = opaque;
2497     unsigned long pagesize;
2498     uint8_t *des;
2499     int len;
2500 
2501     qemu_mutex_lock(&param->mutex);
2502     while (!param->quit) {
2503         if (param->des) {
2504             des = param->des;
2505             len = param->len;
2506             param->des = 0;
2507             qemu_mutex_unlock(&param->mutex);
2508 
2509             pagesize = TARGET_PAGE_SIZE;
2510             /* uncompress() can fail in some cases, especially when the
2511              * page was dirtied while it was being compressed.  That's not
2512              * a problem because the dirty page will be retransferred and
2513              * uncompress() won't corrupt the data in other pages.
2514              */
2515             uncompress((Bytef *)des, &pagesize,
2516                        (const Bytef *)param->compbuf, len);
2517 
2518             qemu_mutex_lock(&decomp_done_lock);
2519             param->done = true;
2520             qemu_cond_signal(&decomp_done_cond);
2521             qemu_mutex_unlock(&decomp_done_lock);
2522 
2523             qemu_mutex_lock(&param->mutex);
2524         } else {
2525             qemu_cond_wait(&param->cond, &param->mutex);
2526         }
2527     }
2528     qemu_mutex_unlock(&param->mutex);
2529 
2530     return NULL;
2531 }
2532 
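/*
 * Block until every decompression worker has finished the page it was
 * given; called at the end of ram_load() before returning.
 */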
2533 static void wait_for_decompress_done(void)
2534 {
2535     int idx, thread_count;
2536 
2537     if (!migrate_use_compression()) {
2538         return;
2539     }
2540 
2541     thread_count = migrate_decompress_threads();
2542     qemu_mutex_lock(&decomp_done_lock);
2543     for (idx = 0; idx < thread_count; idx++) {
2544         while (!decomp_param[idx].done) {
2545             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2546         }
2547     }
2548     qemu_mutex_unlock(&decomp_done_lock);
2549 }
2550 
2551 static void compress_threads_load_setup(void)
2552 {
2553     int i, thread_count;
2554 
2555     if (!migrate_use_compression()) {
2556         return;
2557     }
2558     thread_count = migrate_decompress_threads();
2559     decompress_threads = g_new0(QemuThread, thread_count);
2560     decomp_param = g_new0(DecompressParam, thread_count);
2561     qemu_mutex_init(&decomp_done_lock);
2562     qemu_cond_init(&decomp_done_cond);
2563     for (i = 0; i < thread_count; i++) {
2564         qemu_mutex_init(&decomp_param[i].mutex);
2565         qemu_cond_init(&decomp_param[i].cond);
2566         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2567         decomp_param[i].done = true;
2568         decomp_param[i].quit = false;
2569         qemu_thread_create(decompress_threads + i, "decompress",
2570                            do_data_decompress, decomp_param + i,
2571                            QEMU_THREAD_JOINABLE);
2572     }
2573 }
2574 
2575 static void compress_threads_load_cleanup(void)
2576 {
2577     int i, thread_count;
2578 
2579     if (!migrate_use_compression()) {
2580         return;
2581     }
2582     thread_count = migrate_decompress_threads();
2583     for (i = 0; i < thread_count; i++) {
2584         qemu_mutex_lock(&decomp_param[i].mutex);
2585         decomp_param[i].quit = true;
2586         qemu_cond_signal(&decomp_param[i].cond);
2587         qemu_mutex_unlock(&decomp_param[i].mutex);
2588     }
2589     for (i = 0; i < thread_count; i++) {
2590         qemu_thread_join(decompress_threads + i);
2591         qemu_mutex_destroy(&decomp_param[i].mutex);
2592         qemu_cond_destroy(&decomp_param[i].cond);
2593         g_free(decomp_param[i].compbuf);
2594     }
2595     g_free(decompress_threads);
2596     g_free(decomp_param);
2597     decompress_threads = NULL;
2598     decomp_param = NULL;
2599 }
2600 
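/*
 * Hand @len bytes of compressed data for @host to an idle decompression
 * thread, waiting on decomp_done_cond until one becomes available.
 */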
2601 static void decompress_data_with_multi_threads(QEMUFile *f,
2602                                                void *host, int len)
2603 {
2604     int idx, thread_count;
2605 
2606     thread_count = migrate_decompress_threads();
2607     qemu_mutex_lock(&decomp_done_lock);
2608     while (true) {
2609         for (idx = 0; idx < thread_count; idx++) {
2610             if (decomp_param[idx].done) {
2611                 decomp_param[idx].done = false;
2612                 qemu_mutex_lock(&decomp_param[idx].mutex);
2613                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2614                 decomp_param[idx].des = host;
2615                 decomp_param[idx].len = len;
2616                 qemu_cond_signal(&decomp_param[idx].cond);
2617                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2618                 break;
2619             }
2620         }
2621         if (idx < thread_count) {
2622             break;
2623         } else {
2624             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2625         }
2626     }
2627     qemu_mutex_unlock(&decomp_done_lock);
2628 }
2629 
2630 /**
2631  * ram_load_setup: Setup RAM for migration incoming side
2632  *
2633  * Returns zero to indicate success and negative for error
2634  *
2635  * @f: QEMUFile where to receive the data
2636  * @opaque: RAMState pointer
2637  */
2638 static int ram_load_setup(QEMUFile *f, void *opaque)
2639 {
2640     xbzrle_load_setup();
2641     compress_threads_load_setup();
2642     ramblock_recv_map_init();
2643     return 0;
2644 }
2645 
2646 static int ram_load_cleanup(void *opaque)
2647 {
2648     RAMBlock *rb;
2649     xbzrle_load_cleanup();
2650     compress_threads_load_cleanup();
2651 
2652     RAMBLOCK_FOREACH(rb) {
2653         g_free(rb->receivedmap);
2654         rb->receivedmap = NULL;
2655     }
2656     return 0;
2657 }
2658 
2659 /**
2660  * ram_postcopy_incoming_init: allocate postcopy data structures
2661  *
2662  * Returns 0 for success and negative if there was one error
2663  *
2664  * @mis: current migration incoming state
2665  *
2666  * Allocate data structures etc needed by incoming migration with
2667  * postcopy-ram. postcopy-ram's similarly named
2668  * postcopy_ram_incoming_init does the work.
2669  */
2670 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2671 {
2672     unsigned long ram_pages = last_ram_page();
2673 
2674     return postcopy_ram_incoming_init(mis, ram_pages);
2675 }
2676 
2677 /**
2678  * ram_load_postcopy: load a page in postcopy case
2679  *
2680  * Returns 0 for success or -errno in case of error
2681  *
2682  * Called in postcopy mode by ram_load().
2683  * rcu_read_lock is taken prior to this being called.
2684  *
2685  * @f: QEMUFile where to send the data
2686  */
2687 static int ram_load_postcopy(QEMUFile *f)
2688 {
2689     int flags = 0, ret = 0;
2690     bool place_needed = false;
2691     bool matching_page_sizes = false;
2692     MigrationIncomingState *mis = migration_incoming_get_current();
2693     /* Temporary page that is later 'placed' */
2694     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2695     void *last_host = NULL;
2696     bool all_zero = false;
2697 
2698     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2699         ram_addr_t addr;
2700         void *host = NULL;
2701         void *page_buffer = NULL;
2702         void *place_source = NULL;
2703         RAMBlock *block = NULL;
2704         uint8_t ch;
2705 
2706         addr = qemu_get_be64(f);
2707         flags = addr & ~TARGET_PAGE_MASK;
2708         addr &= TARGET_PAGE_MASK;
2709 
2710         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2711         place_needed = false;
2712         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2713             block = ram_block_from_stream(f, flags);
2714 
2715             host = host_from_ram_block_offset(block, addr);
2716             if (!host) {
2717                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2718                 ret = -EINVAL;
2719                 break;
2720             }
2721             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2722             /*
2723              * Postcopy requires that we place whole host pages atomically;
2724              * these may be huge pages for RAMBlocks that are backed by
2725              * hugetlbfs.
2726              * To make it atomic, the data is read into a temporary page
2727              * that's moved into place later.
2728              * The migration protocol uses (possibly smaller) target pages;
2729              * however, the source ensures it always sends all the components
2730              * of a host page in order.
2731              */
2732             page_buffer = postcopy_host_page +
2733                           ((uintptr_t)host & (block->page_size - 1));
2734             /* First TP of this HP: assume all TPs are zero, so the place
2735              * can be optimised; cleared below as soon as non-zero data
2736              * arrives. */
2735             if (!((uintptr_t)host & (block->page_size - 1))) {
2736                 all_zero = true;
2737             } else {
2738                 /* not the 1st TP within the HP */
2739                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2740                     error_report("Non-sequential target page %p/%p",
2741                                   host, last_host);
2742                     ret = -EINVAL;
2743                     break;
2744                 }
2745             }
2746 
2747 
2748             /*
2749              * If it's the last part of a host page then we place the host
2750              * page
2751              */
2752             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2753                                      (block->page_size - 1)) == 0;
2754             place_source = postcopy_host_page;
2755         }
2756         last_host = host;
2757 
2758         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2759         case RAM_SAVE_FLAG_ZERO:
2760             ch = qemu_get_byte(f);
2761             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2762             if (ch) {
2763                 all_zero = false;
2764             }
2765             break;
2766 
2767         case RAM_SAVE_FLAG_PAGE:
2768             all_zero = false;
2769             if (!place_needed || !matching_page_sizes) {
2770                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2771             } else {
2772                 /* Avoids the qemu_file copy during postcopy, which is
2773                  * going to do a copy of the page later anyway; we can only
2774                  * do this when the page is read in one go (matching page
2775                  * sizes).
2776                  */
2776                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2777                                          TARGET_PAGE_SIZE);
2778             }
2779             break;
2780         case RAM_SAVE_FLAG_EOS:
2781             /* normal exit */
2782             break;
2783         default:
2784             error_report("Unknown combination of migration flags: %#x"
2785                          " (postcopy mode)", flags);
2786             ret = -EINVAL;
2787         }
2788 
2789         if (place_needed) {
2790             /* This gets called at the last target page in the host page */
2791             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2792 
2793             if (all_zero) {
2794                 ret = postcopy_place_page_zero(mis, place_dest,
2795                                                block);
2796             } else {
2797                 ret = postcopy_place_page(mis, place_dest,
2798                                           place_source, block);
2799             }
2800         }
2801         if (!ret) {
2802             ret = qemu_file_get_error(f);
2803         }
2804     }
2805 
2806     return ret;
2807 }
2808 
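/*
 * Incoming side entry point.  Once the destination is in postcopy listening
 * state the work is delegated to ram_load_postcopy(); otherwise this loop
 * reads addr/flags words and applies MEM_SIZE, zero, full, compressed and
 * XBZRLE pages until RAM_SAVE_FLAG_EOS or an error.
 */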
2809 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2810 {
2811     int flags = 0, ret = 0, invalid_flags = 0;
2812     static uint64_t seq_iter;
2813     int len = 0;
2814     /*
2815      * If the system is running in postcopy mode, page inserts to host
2816      * memory must be atomic.
2817      */
2818     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2819     /* ADVISE is earlier; it shows the source has the postcopy capability enabled */
2820     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2821 
2822     seq_iter++;
2823 
2824     if (version_id != 4) {
2825         ret = -EINVAL;
2826     }
2827 
2828     if (!migrate_use_compression()) {
2829         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2830     }
2831     /* This RCU critical section can be very long running.
2832      * When RCU reclaims in the code start to become numerous,
2833      * it will be necessary to reduce the granularity of this
2834      * critical section.
2835      */
2836     rcu_read_lock();
2837 
2838     if (postcopy_running) {
2839         ret = ram_load_postcopy(f);
2840     }
2841 
2842     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2843         ram_addr_t addr, total_ram_bytes;
2844         void *host = NULL;
2845         uint8_t ch;
2846 
2847         addr = qemu_get_be64(f);
2848         flags = addr & ~TARGET_PAGE_MASK;
2849         addr &= TARGET_PAGE_MASK;
2850 
2851         if (flags & invalid_flags) {
2852             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2853                 error_report("Received an unexpected compressed page");
2854             }
2855 
2856             ret = -EINVAL;
2857             break;
2858         }
2859 
2860         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2861                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2862             RAMBlock *block = ram_block_from_stream(f, flags);
2863 
2864             host = host_from_ram_block_offset(block, addr);
2865             if (!host) {
2866                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2867                 ret = -EINVAL;
2868                 break;
2869             }
2870             ramblock_recv_bitmap_set(block, host);
2871             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2872         }
2873 
2874         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2875         case RAM_SAVE_FLAG_MEM_SIZE:
2876             /* Synchronize RAM block list */
2877             total_ram_bytes = addr;
2878             while (!ret && total_ram_bytes) {
2879                 RAMBlock *block;
2880                 char id[256];
2881                 ram_addr_t length;
2882 
2883                 len = qemu_get_byte(f);
2884                 qemu_get_buffer(f, (uint8_t *)id, len);
2885                 id[len] = 0;
2886                 length = qemu_get_be64(f);
2887 
2888                 block = qemu_ram_block_by_name(id);
2889                 if (block) {
2890                     if (length != block->used_length) {
2891                         Error *local_err = NULL;
2892 
2893                         ret = qemu_ram_resize(block, length,
2894                                               &local_err);
2895                         if (local_err) {
2896                             error_report_err(local_err);
2897                         }
2898                     }
2899                     /* For postcopy we need to check hugepage sizes match */
2900                     if (postcopy_advised &&
2901                         block->page_size != qemu_host_page_size) {
2902                         uint64_t remote_page_size = qemu_get_be64(f);
2903                         if (remote_page_size != block->page_size) {
2904                             error_report("Mismatched RAM page size %s "
2905                                          "(local) %zd != %" PRId64,
2906                                          id, block->page_size,
2907                                          remote_page_size);
2908                             ret = -EINVAL;
2909                         }
2910                     }
2911                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2912                                           block->idstr);
2913                 } else {
2914                     error_report("Unknown ramblock \"%s\", cannot "
2915                                  "accept migration", id);
2916                     ret = -EINVAL;
2917                 }
2918 
2919                 total_ram_bytes -= length;
2920             }
2921             break;
2922 
2923         case RAM_SAVE_FLAG_ZERO:
2924             ch = qemu_get_byte(f);
2925             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2926             break;
2927 
2928         case RAM_SAVE_FLAG_PAGE:
2929             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2930             break;
2931 
2932         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2933             len = qemu_get_be32(f);
2934             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2935                 error_report("Invalid compressed data length: %d", len);
2936                 ret = -EINVAL;
2937                 break;
2938             }
2939             decompress_data_with_multi_threads(f, host, len);
2940             break;
2941 
2942         case RAM_SAVE_FLAG_XBZRLE:
2943             if (load_xbzrle(f, addr, host) < 0) {
2944                 error_report("Failed to decompress XBZRLE page at "
2945                              RAM_ADDR_FMT, addr);
2946                 ret = -EINVAL;
2947                 break;
2948             }
2949             break;
2950         case RAM_SAVE_FLAG_EOS:
2951             /* normal exit */
2952             break;
2953         default:
2954             if (flags & RAM_SAVE_FLAG_HOOK) {
2955                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2956             } else {
2957                 error_report("Unknown combination of migration flags: %#x",
2958                              flags);
2959                 ret = -EINVAL;
2960             }
2961         }
2962         if (!ret) {
2963             ret = qemu_file_get_error(f);
2964         }
2965     }
2966 
2967     wait_for_decompress_done();
2968     rcu_read_unlock();
2969     trace_ram_load_complete(ret, seq_iter);
2970     return ret;
2971 }
2972 
2973 static bool ram_has_postcopy(void *opaque)
2974 {
2975     return migrate_postcopy_ram();
2976 }
2977 
2978 static SaveVMHandlers savevm_ram_handlers = {
2979     .save_setup = ram_save_setup,
2980     .save_live_iterate = ram_save_iterate,
2981     .save_live_complete_postcopy = ram_save_complete,
2982     .save_live_complete_precopy = ram_save_complete,
2983     .has_postcopy = ram_has_postcopy,
2984     .save_live_pending = ram_save_pending,
2985     .load_state = ram_load,
2986     .save_cleanup = ram_save_cleanup,
2987     .load_setup = ram_load_setup,
2988     .load_cleanup = ram_load_cleanup,
2989 };
2990 
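/*
 * Register the "ram" savevm handlers (stream version 4) and initialise the
 * XBZRLE cache lock.
 */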
2991 void ram_mig_init(void)
2992 {
2993     qemu_mutex_init(&XBZRLE.lock);
2994     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2995 }
2996