xref: /openbmc/qemu/migration/ram.c (revision 9cdd2a73)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "migration/block.h"
54 
55 /***********************************************************/
56 /* ram save/restore */
57 
58 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
59  * worked for pages that where filled with the same char.  We switched
60  * it to only search for the zero value.  And to avoid confusion with
61  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
62  */
63 
64 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
65 #define RAM_SAVE_FLAG_ZERO     0x02
66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
67 #define RAM_SAVE_FLAG_PAGE     0x08
68 #define RAM_SAVE_FLAG_EOS      0x10
69 #define RAM_SAVE_FLAG_CONTINUE 0x20
70 #define RAM_SAVE_FLAG_XBZRLE   0x40
71 /* 0x80 is reserved in migration.h start with 0x100 next */
72 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
73 
74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
75 {
76     return buffer_is_zero(p, size);
77 }
78 
79 XBZRLECacheStats xbzrle_counters;
80 
81 /* struct contains XBZRLE cache and a static page
82    used by the compression */
83 static struct {
84     /* buffer used for XBZRLE encoding */
85     uint8_t *encoded_buf;
86     /* buffer for storing page content */
87     uint8_t *current_buf;
88     /* Cache for XBZRLE, Protected by lock. */
89     PageCache *cache;
90     QemuMutex lock;
91     /* it will store a page full of zeros */
92     uint8_t *zero_target_page;
93     /* buffer used for XBZRLE decoding */
94     uint8_t *decoded_buf;
95 } XBZRLE;
96 
97 static void XBZRLE_cache_lock(void)
98 {
99     if (migrate_use_xbzrle())
100         qemu_mutex_lock(&XBZRLE.lock);
101 }
102 
103 static void XBZRLE_cache_unlock(void)
104 {
105     if (migrate_use_xbzrle())
106         qemu_mutex_unlock(&XBZRLE.lock);
107 }
108 
109 /**
110  * xbzrle_cache_resize: resize the xbzrle cache
111  *
112  * This function is called from qmp_migrate_set_cache_size in main
113  * thread, possibly while a migration is in progress.  A running
114  * migration may be using the cache and might finish during this call,
115  * hence changes to the cache are protected by XBZRLE.lock().
116  *
117  * Returns 0 for success or -1 for error
118  *
119  * @new_size: new cache size
120  * @errp: set *errp if the check failed, with reason
121  */
122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
123 {
124     PageCache *new_cache;
125     int64_t ret = 0;
126 
127     /* Check for truncation */
128     if (new_size != (size_t)new_size) {
129         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
130                    "exceeding address space");
131         return -1;
132     }
133 
134     if (new_size == migrate_xbzrle_cache_size()) {
135         /* nothing to do */
136         return 0;
137     }
138 
139     XBZRLE_cache_lock();
140 
141     if (XBZRLE.cache != NULL) {
142         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
143         if (!new_cache) {
144             ret = -1;
145             goto out;
146         }
147 
148         cache_fini(XBZRLE.cache);
149         XBZRLE.cache = new_cache;
150     }
151 out:
152     XBZRLE_cache_unlock();
153     return ret;
154 }
155 
156 static void ramblock_recv_map_init(void)
157 {
158     RAMBlock *rb;
159 
160     RAMBLOCK_FOREACH(rb) {
161         assert(!rb->receivedmap);
162         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
163     }
164 }
165 
166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
167 {
168     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
169                     rb->receivedmap);
170 }
171 
172 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
173 {
174     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
175 }
176 
177 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
178                                     size_t nr)
179 {
180     bitmap_set_atomic(rb->receivedmap,
181                       ramblock_recv_bitmap_offset(host_addr, rb),
182                       nr);
183 }
184 
185 /*
186  * An outstanding page request, on the source, having been received
187  * and queued
188  */
189 struct RAMSrcPageRequest {
190     RAMBlock *rb;
191     hwaddr    offset;
192     hwaddr    len;
193 
194     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
195 };
196 
197 /* State of RAM for migration */
198 struct RAMState {
199     /* QEMUFile used for this migration */
200     QEMUFile *f;
201     /* Last block that we have visited searching for dirty pages */
202     RAMBlock *last_seen_block;
203     /* Last block from where we have sent data */
204     RAMBlock *last_sent_block;
205     /* Last dirty target page we have sent */
206     ram_addr_t last_page;
207     /* last ram version we have seen */
208     uint32_t last_version;
209     /* We are in the first round */
210     bool ram_bulk_stage;
211     /* How many times we have dirty too many pages */
212     int dirty_rate_high_cnt;
213     /* these variables are used for bitmap sync */
214     /* last time we did a full bitmap_sync */
215     int64_t time_last_bitmap_sync;
216     /* bytes transferred at start_time */
217     uint64_t bytes_xfer_prev;
218     /* number of dirty pages since start_time */
219     uint64_t num_dirty_pages_period;
220     /* xbzrle misses since the beginning of the period */
221     uint64_t xbzrle_cache_miss_prev;
222     /* number of iterations at the beginning of period */
223     uint64_t iterations_prev;
224     /* Iterations since start */
225     uint64_t iterations;
226     /* number of dirty bits in the bitmap */
227     uint64_t migration_dirty_pages;
228     /* protects modification of the bitmap */
229     QemuMutex bitmap_mutex;
230     /* The RAMBlock used in the last src_page_requests */
231     RAMBlock *last_req_rb;
232     /* Queue of outstanding page requests from the destination */
233     QemuMutex src_page_req_mutex;
234     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
235 };
236 typedef struct RAMState RAMState;
237 
238 static RAMState *ram_state;
239 
240 uint64_t ram_bytes_remaining(void)
241 {
242     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
243                        0;
244 }
245 
246 MigrationStats ram_counters;
247 
248 /* used by the search for pages to send */
249 struct PageSearchStatus {
250     /* Current block being searched */
251     RAMBlock    *block;
252     /* Current page to search from */
253     unsigned long page;
254     /* Set once we wrap around */
255     bool         complete_round;
256 };
257 typedef struct PageSearchStatus PageSearchStatus;
258 
259 struct CompressParam {
260     bool done;
261     bool quit;
262     QEMUFile *file;
263     QemuMutex mutex;
264     QemuCond cond;
265     RAMBlock *block;
266     ram_addr_t offset;
267 };
268 typedef struct CompressParam CompressParam;
269 
270 struct DecompressParam {
271     bool done;
272     bool quit;
273     QemuMutex mutex;
274     QemuCond cond;
275     void *des;
276     uint8_t *compbuf;
277     int len;
278 };
279 typedef struct DecompressParam DecompressParam;
280 
281 static CompressParam *comp_param;
282 static QemuThread *compress_threads;
283 /* comp_done_cond is used to wake up the migration thread when
284  * one of the compression threads has finished the compression.
285  * comp_done_lock is used to co-work with comp_done_cond.
286  */
287 static QemuMutex comp_done_lock;
288 static QemuCond comp_done_cond;
289 /* The empty QEMUFileOps will be used by file in CompressParam */
290 static const QEMUFileOps empty_ops = { };
291 
292 static DecompressParam *decomp_param;
293 static QemuThread *decompress_threads;
294 static QemuMutex decomp_done_lock;
295 static QemuCond decomp_done_cond;
296 
297 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
298                                 ram_addr_t offset);
299 
300 static void *do_data_compress(void *opaque)
301 {
302     CompressParam *param = opaque;
303     RAMBlock *block;
304     ram_addr_t offset;
305 
306     qemu_mutex_lock(&param->mutex);
307     while (!param->quit) {
308         if (param->block) {
309             block = param->block;
310             offset = param->offset;
311             param->block = NULL;
312             qemu_mutex_unlock(&param->mutex);
313 
314             do_compress_ram_page(param->file, block, offset);
315 
316             qemu_mutex_lock(&comp_done_lock);
317             param->done = true;
318             qemu_cond_signal(&comp_done_cond);
319             qemu_mutex_unlock(&comp_done_lock);
320 
321             qemu_mutex_lock(&param->mutex);
322         } else {
323             qemu_cond_wait(&param->cond, &param->mutex);
324         }
325     }
326     qemu_mutex_unlock(&param->mutex);
327 
328     return NULL;
329 }
330 
331 static inline void terminate_compression_threads(void)
332 {
333     int idx, thread_count;
334 
335     thread_count = migrate_compress_threads();
336 
337     for (idx = 0; idx < thread_count; idx++) {
338         qemu_mutex_lock(&comp_param[idx].mutex);
339         comp_param[idx].quit = true;
340         qemu_cond_signal(&comp_param[idx].cond);
341         qemu_mutex_unlock(&comp_param[idx].mutex);
342     }
343 }
344 
345 static void compress_threads_save_cleanup(void)
346 {
347     int i, thread_count;
348 
349     if (!migrate_use_compression()) {
350         return;
351     }
352     terminate_compression_threads();
353     thread_count = migrate_compress_threads();
354     for (i = 0; i < thread_count; i++) {
355         qemu_thread_join(compress_threads + i);
356         qemu_fclose(comp_param[i].file);
357         qemu_mutex_destroy(&comp_param[i].mutex);
358         qemu_cond_destroy(&comp_param[i].cond);
359     }
360     qemu_mutex_destroy(&comp_done_lock);
361     qemu_cond_destroy(&comp_done_cond);
362     g_free(compress_threads);
363     g_free(comp_param);
364     compress_threads = NULL;
365     comp_param = NULL;
366 }
367 
368 static void compress_threads_save_setup(void)
369 {
370     int i, thread_count;
371 
372     if (!migrate_use_compression()) {
373         return;
374     }
375     thread_count = migrate_compress_threads();
376     compress_threads = g_new0(QemuThread, thread_count);
377     comp_param = g_new0(CompressParam, thread_count);
378     qemu_cond_init(&comp_done_cond);
379     qemu_mutex_init(&comp_done_lock);
380     for (i = 0; i < thread_count; i++) {
381         /* comp_param[i].file is just used as a dummy buffer to save data,
382          * set its ops to empty.
383          */
384         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
385         comp_param[i].done = true;
386         comp_param[i].quit = false;
387         qemu_mutex_init(&comp_param[i].mutex);
388         qemu_cond_init(&comp_param[i].cond);
389         qemu_thread_create(compress_threads + i, "compress",
390                            do_data_compress, comp_param + i,
391                            QEMU_THREAD_JOINABLE);
392     }
393 }
394 
395 /* Multiple fd's */
396 
397 struct MultiFDSendParams {
398     uint8_t id;
399     char *name;
400     QemuThread thread;
401     QemuSemaphore sem;
402     QemuMutex mutex;
403     bool quit;
404 };
405 typedef struct MultiFDSendParams MultiFDSendParams;
406 
407 struct {
408     MultiFDSendParams *params;
409     /* number of created threads */
410     int count;
411 } *multifd_send_state;
412 
413 static void terminate_multifd_send_threads(Error *errp)
414 {
415     int i;
416 
417     for (i = 0; i < multifd_send_state->count; i++) {
418         MultiFDSendParams *p = &multifd_send_state->params[i];
419 
420         qemu_mutex_lock(&p->mutex);
421         p->quit = true;
422         qemu_sem_post(&p->sem);
423         qemu_mutex_unlock(&p->mutex);
424     }
425 }
426 
427 int multifd_save_cleanup(Error **errp)
428 {
429     int i;
430     int ret = 0;
431 
432     if (!migrate_use_multifd()) {
433         return 0;
434     }
435     terminate_multifd_send_threads(NULL);
436     for (i = 0; i < multifd_send_state->count; i++) {
437         MultiFDSendParams *p = &multifd_send_state->params[i];
438 
439         qemu_thread_join(&p->thread);
440         qemu_mutex_destroy(&p->mutex);
441         qemu_sem_destroy(&p->sem);
442         g_free(p->name);
443         p->name = NULL;
444     }
445     g_free(multifd_send_state->params);
446     multifd_send_state->params = NULL;
447     g_free(multifd_send_state);
448     multifd_send_state = NULL;
449     return ret;
450 }
451 
452 static void *multifd_send_thread(void *opaque)
453 {
454     MultiFDSendParams *p = opaque;
455 
456     while (true) {
457         qemu_mutex_lock(&p->mutex);
458         if (p->quit) {
459             qemu_mutex_unlock(&p->mutex);
460             break;
461         }
462         qemu_mutex_unlock(&p->mutex);
463         qemu_sem_wait(&p->sem);
464     }
465 
466     return NULL;
467 }
468 
469 int multifd_save_setup(void)
470 {
471     int thread_count;
472     uint8_t i;
473 
474     if (!migrate_use_multifd()) {
475         return 0;
476     }
477     thread_count = migrate_multifd_channels();
478     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
479     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
480     multifd_send_state->count = 0;
481     for (i = 0; i < thread_count; i++) {
482         MultiFDSendParams *p = &multifd_send_state->params[i];
483 
484         qemu_mutex_init(&p->mutex);
485         qemu_sem_init(&p->sem, 0);
486         p->quit = false;
487         p->id = i;
488         p->name = g_strdup_printf("multifdsend_%d", i);
489         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
490                            QEMU_THREAD_JOINABLE);
491 
492         multifd_send_state->count++;
493     }
494     return 0;
495 }
496 
497 struct MultiFDRecvParams {
498     uint8_t id;
499     char *name;
500     QemuThread thread;
501     QemuSemaphore sem;
502     QemuMutex mutex;
503     bool quit;
504 };
505 typedef struct MultiFDRecvParams MultiFDRecvParams;
506 
507 struct {
508     MultiFDRecvParams *params;
509     /* number of created threads */
510     int count;
511 } *multifd_recv_state;
512 
513 static void terminate_multifd_recv_threads(Error *errp)
514 {
515     int i;
516 
517     for (i = 0; i < multifd_recv_state->count; i++) {
518         MultiFDRecvParams *p = &multifd_recv_state->params[i];
519 
520         qemu_mutex_lock(&p->mutex);
521         p->quit = true;
522         qemu_sem_post(&p->sem);
523         qemu_mutex_unlock(&p->mutex);
524     }
525 }
526 
527 int multifd_load_cleanup(Error **errp)
528 {
529     int i;
530     int ret = 0;
531 
532     if (!migrate_use_multifd()) {
533         return 0;
534     }
535     terminate_multifd_recv_threads(NULL);
536     for (i = 0; i < multifd_recv_state->count; i++) {
537         MultiFDRecvParams *p = &multifd_recv_state->params[i];
538 
539         qemu_thread_join(&p->thread);
540         qemu_mutex_destroy(&p->mutex);
541         qemu_sem_destroy(&p->sem);
542         g_free(p->name);
543         p->name = NULL;
544     }
545     g_free(multifd_recv_state->params);
546     multifd_recv_state->params = NULL;
547     g_free(multifd_recv_state);
548     multifd_recv_state = NULL;
549 
550     return ret;
551 }
552 
553 static void *multifd_recv_thread(void *opaque)
554 {
555     MultiFDRecvParams *p = opaque;
556 
557     while (true) {
558         qemu_mutex_lock(&p->mutex);
559         if (p->quit) {
560             qemu_mutex_unlock(&p->mutex);
561             break;
562         }
563         qemu_mutex_unlock(&p->mutex);
564         qemu_sem_wait(&p->sem);
565     }
566 
567     return NULL;
568 }
569 
570 int multifd_load_setup(void)
571 {
572     int thread_count;
573     uint8_t i;
574 
575     if (!migrate_use_multifd()) {
576         return 0;
577     }
578     thread_count = migrate_multifd_channels();
579     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
580     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
581     multifd_recv_state->count = 0;
582     for (i = 0; i < thread_count; i++) {
583         MultiFDRecvParams *p = &multifd_recv_state->params[i];
584 
585         qemu_mutex_init(&p->mutex);
586         qemu_sem_init(&p->sem, 0);
587         p->quit = false;
588         p->id = i;
589         p->name = g_strdup_printf("multifdrecv_%d", i);
590         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
591                            QEMU_THREAD_JOINABLE);
592         multifd_recv_state->count++;
593     }
594     return 0;
595 }
596 
597 /**
598  * save_page_header: write page header to wire
599  *
600  * If this is the 1st block, it also writes the block identification
601  *
602  * Returns the number of bytes written
603  *
604  * @f: QEMUFile where to send the data
605  * @block: block that contains the page we want to send
606  * @offset: offset inside the block for the page
607  *          in the lower bits, it contains flags
608  */
609 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
610                                ram_addr_t offset)
611 {
612     size_t size, len;
613 
614     if (block == rs->last_sent_block) {
615         offset |= RAM_SAVE_FLAG_CONTINUE;
616     }
617     qemu_put_be64(f, offset);
618     size = 8;
619 
620     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
621         len = strlen(block->idstr);
622         qemu_put_byte(f, len);
623         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
624         size += 1 + len;
625         rs->last_sent_block = block;
626     }
627     return size;
628 }
629 
630 /**
631  * mig_throttle_guest_down: throotle down the guest
632  *
633  * Reduce amount of guest cpu execution to hopefully slow down memory
634  * writes. If guest dirty memory rate is reduced below the rate at
635  * which we can transfer pages to the destination then we should be
636  * able to complete migration. Some workloads dirty memory way too
637  * fast and will not effectively converge, even with auto-converge.
638  */
639 static void mig_throttle_guest_down(void)
640 {
641     MigrationState *s = migrate_get_current();
642     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
643     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
644 
645     /* We have not started throttling yet. Let's start it. */
646     if (!cpu_throttle_active()) {
647         cpu_throttle_set(pct_initial);
648     } else {
649         /* Throttling already on, just increase the rate */
650         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
651     }
652 }
653 
654 /**
655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
656  *
657  * @rs: current RAM state
658  * @current_addr: address for the zero page
659  *
660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
661  * The important thing is that a stale (not-yet-0'd) page be replaced
662  * by the new data.
663  * As a bonus, if the page wasn't in the cache it gets added so that
664  * when a small write is made into the 0'd page it gets XBZRLE sent.
665  */
666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
667 {
668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
669         return;
670     }
671 
672     /* We don't care if this fails to allocate a new cache page
673      * as long as it updated an old one */
674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
675                  ram_counters.dirty_sync_count);
676 }
677 
678 #define ENCODING_FLAG_XBZRLE 0x1
679 
680 /**
681  * save_xbzrle_page: compress and send current page
682  *
683  * Returns: 1 means that we wrote the page
684  *          0 means that page is identical to the one already sent
685  *          -1 means that xbzrle would be longer than normal
686  *
687  * @rs: current RAM state
688  * @current_data: pointer to the address of the page contents
689  * @current_addr: addr of the page
690  * @block: block that contains the page we want to send
691  * @offset: offset inside the block for the page
692  * @last_stage: if we are at the completion stage
693  */
694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
695                             ram_addr_t current_addr, RAMBlock *block,
696                             ram_addr_t offset, bool last_stage)
697 {
698     int encoded_len = 0, bytes_xbzrle;
699     uint8_t *prev_cached_page;
700 
701     if (!cache_is_cached(XBZRLE.cache, current_addr,
702                          ram_counters.dirty_sync_count)) {
703         xbzrle_counters.cache_miss++;
704         if (!last_stage) {
705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
706                              ram_counters.dirty_sync_count) == -1) {
707                 return -1;
708             } else {
709                 /* update *current_data when the page has been
710                    inserted into cache */
711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
712             }
713         }
714         return -1;
715     }
716 
717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718 
719     /* save current buffer into memory */
720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721 
722     /* XBZRLE encoding (if there is no overflow) */
723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725                                        TARGET_PAGE_SIZE);
726     if (encoded_len == 0) {
727         trace_save_xbzrle_page_skipping();
728         return 0;
729     } else if (encoded_len == -1) {
730         trace_save_xbzrle_page_overflow();
731         xbzrle_counters.overflow++;
732         /* update data in the cache */
733         if (!last_stage) {
734             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
735             *current_data = prev_cached_page;
736         }
737         return -1;
738     }
739 
740     /* we need to update the data in the cache, in order to get the same data */
741     if (!last_stage) {
742         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
743     }
744 
745     /* Send XBZRLE based compressed page */
746     bytes_xbzrle = save_page_header(rs, rs->f, block,
747                                     offset | RAM_SAVE_FLAG_XBZRLE);
748     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
749     qemu_put_be16(rs->f, encoded_len);
750     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
751     bytes_xbzrle += encoded_len + 1 + 2;
752     xbzrle_counters.pages++;
753     xbzrle_counters.bytes += bytes_xbzrle;
754     ram_counters.transferred += bytes_xbzrle;
755 
756     return 1;
757 }
758 
759 /**
760  * migration_bitmap_find_dirty: find the next dirty page from start
761  *
762  * Called with rcu_read_lock() to protect migration_bitmap
763  *
764  * Returns the byte offset within memory region of the start of a dirty page
765  *
766  * @rs: current RAM state
767  * @rb: RAMBlock where to search for dirty pages
768  * @start: page where we start the search
769  */
770 static inline
771 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
772                                           unsigned long start)
773 {
774     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
775     unsigned long *bitmap = rb->bmap;
776     unsigned long next;
777 
778     if (rs->ram_bulk_stage && start > 0) {
779         next = start + 1;
780     } else {
781         next = find_next_bit(bitmap, size, start);
782     }
783 
784     return next;
785 }
786 
787 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
788                                                 RAMBlock *rb,
789                                                 unsigned long page)
790 {
791     bool ret;
792 
793     ret = test_and_clear_bit(page, rb->bmap);
794 
795     if (ret) {
796         rs->migration_dirty_pages--;
797     }
798     return ret;
799 }
800 
801 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
802                                         ram_addr_t start, ram_addr_t length)
803 {
804     rs->migration_dirty_pages +=
805         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
806                                               &rs->num_dirty_pages_period);
807 }
808 
809 /**
810  * ram_pagesize_summary: calculate all the pagesizes of a VM
811  *
812  * Returns a summary bitmap of the page sizes of all RAMBlocks
813  *
814  * For VMs with just normal pages this is equivalent to the host page
815  * size. If it's got some huge pages then it's the OR of all the
816  * different page sizes.
817  */
818 uint64_t ram_pagesize_summary(void)
819 {
820     RAMBlock *block;
821     uint64_t summary = 0;
822 
823     RAMBLOCK_FOREACH(block) {
824         summary |= block->page_size;
825     }
826 
827     return summary;
828 }
829 
830 static void migration_bitmap_sync(RAMState *rs)
831 {
832     RAMBlock *block;
833     int64_t end_time;
834     uint64_t bytes_xfer_now;
835 
836     ram_counters.dirty_sync_count++;
837 
838     if (!rs->time_last_bitmap_sync) {
839         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
840     }
841 
842     trace_migration_bitmap_sync_start();
843     memory_global_dirty_log_sync();
844 
845     qemu_mutex_lock(&rs->bitmap_mutex);
846     rcu_read_lock();
847     RAMBLOCK_FOREACH(block) {
848         migration_bitmap_sync_range(rs, block, 0, block->used_length);
849     }
850     rcu_read_unlock();
851     qemu_mutex_unlock(&rs->bitmap_mutex);
852 
853     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
854 
855     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
856 
857     /* more than 1 second = 1000 millisecons */
858     if (end_time > rs->time_last_bitmap_sync + 1000) {
859         /* calculate period counters */
860         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
861             / (end_time - rs->time_last_bitmap_sync);
862         bytes_xfer_now = ram_counters.transferred;
863 
864         /* During block migration the auto-converge logic incorrectly detects
865          * that ram migration makes no progress. Avoid this by disabling the
866          * throttling logic during the bulk phase of block migration. */
867         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
868             /* The following detection logic can be refined later. For now:
869                Check to see if the dirtied bytes is 50% more than the approx.
870                amount of bytes that just got transferred since the last time we
871                were in this routine. If that happens twice, start or increase
872                throttling */
873 
874             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
875                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
876                 (++rs->dirty_rate_high_cnt >= 2)) {
877                     trace_migration_throttle();
878                     rs->dirty_rate_high_cnt = 0;
879                     mig_throttle_guest_down();
880             }
881         }
882 
883         if (migrate_use_xbzrle()) {
884             if (rs->iterations_prev != rs->iterations) {
885                 xbzrle_counters.cache_miss_rate =
886                    (double)(xbzrle_counters.cache_miss -
887                             rs->xbzrle_cache_miss_prev) /
888                    (rs->iterations - rs->iterations_prev);
889             }
890             rs->iterations_prev = rs->iterations;
891             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
892         }
893 
894         /* reset period counters */
895         rs->time_last_bitmap_sync = end_time;
896         rs->num_dirty_pages_period = 0;
897         rs->bytes_xfer_prev = bytes_xfer_now;
898     }
899     if (migrate_use_events()) {
900         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
901     }
902 }
903 
904 /**
905  * save_zero_page: send the zero page to the stream
906  *
907  * Returns the number of pages written.
908  *
909  * @rs: current RAM state
910  * @block: block that contains the page we want to send
911  * @offset: offset inside the block for the page
912  */
913 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
914 {
915     uint8_t *p = block->host + offset;
916     int pages = -1;
917 
918     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
919         ram_counters.duplicate++;
920         ram_counters.transferred +=
921             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
922         qemu_put_byte(rs->f, 0);
923         ram_counters.transferred += 1;
924         pages = 1;
925     }
926 
927     return pages;
928 }
929 
930 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
931 {
932     if (!migrate_release_ram() || !migration_in_postcopy()) {
933         return;
934     }
935 
936     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
937 }
938 
939 /**
940  * ram_save_page: send the given page to the stream
941  *
942  * Returns the number of pages written.
943  *          < 0 - error
944  *          >=0 - Number of pages written - this might legally be 0
945  *                if xbzrle noticed the page was the same.
946  *
947  * @rs: current RAM state
948  * @block: block that contains the page we want to send
949  * @offset: offset inside the block for the page
950  * @last_stage: if we are at the completion stage
951  */
952 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
953 {
954     int pages = -1;
955     uint64_t bytes_xmit;
956     ram_addr_t current_addr;
957     uint8_t *p;
958     int ret;
959     bool send_async = true;
960     RAMBlock *block = pss->block;
961     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
962 
963     p = block->host + offset;
964     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
965 
966     /* In doubt sent page as normal */
967     bytes_xmit = 0;
968     ret = ram_control_save_page(rs->f, block->offset,
969                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
970     if (bytes_xmit) {
971         ram_counters.transferred += bytes_xmit;
972         pages = 1;
973     }
974 
975     XBZRLE_cache_lock();
976 
977     current_addr = block->offset + offset;
978 
979     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
980         if (ret != RAM_SAVE_CONTROL_DELAYED) {
981             if (bytes_xmit > 0) {
982                 ram_counters.normal++;
983             } else if (bytes_xmit == 0) {
984                 ram_counters.duplicate++;
985             }
986         }
987     } else {
988         pages = save_zero_page(rs, block, offset);
989         if (pages > 0) {
990             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
991              * page would be stale
992              */
993             xbzrle_cache_zero_page(rs, current_addr);
994             ram_release_pages(block->idstr, offset, pages);
995         } else if (!rs->ram_bulk_stage &&
996                    !migration_in_postcopy() && migrate_use_xbzrle()) {
997             pages = save_xbzrle_page(rs, &p, current_addr, block,
998                                      offset, last_stage);
999             if (!last_stage) {
1000                 /* Can't send this cached data async, since the cache page
1001                  * might get updated before it gets to the wire
1002                  */
1003                 send_async = false;
1004             }
1005         }
1006     }
1007 
1008     /* XBZRLE overflow or normal page */
1009     if (pages == -1) {
1010         ram_counters.transferred +=
1011             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1012         if (send_async) {
1013             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1014                                   migrate_release_ram() &
1015                                   migration_in_postcopy());
1016         } else {
1017             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1018         }
1019         ram_counters.transferred += TARGET_PAGE_SIZE;
1020         pages = 1;
1021         ram_counters.normal++;
1022     }
1023 
1024     XBZRLE_cache_unlock();
1025 
1026     return pages;
1027 }
1028 
1029 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1030                                 ram_addr_t offset)
1031 {
1032     RAMState *rs = ram_state;
1033     int bytes_sent, blen;
1034     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1035 
1036     bytes_sent = save_page_header(rs, f, block, offset |
1037                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1038     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1039                                      migrate_compress_level());
1040     if (blen < 0) {
1041         bytes_sent = 0;
1042         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1043         error_report("compressed data failed!");
1044     } else {
1045         bytes_sent += blen;
1046         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1047     }
1048 
1049     return bytes_sent;
1050 }
1051 
1052 static void flush_compressed_data(RAMState *rs)
1053 {
1054     int idx, len, thread_count;
1055 
1056     if (!migrate_use_compression()) {
1057         return;
1058     }
1059     thread_count = migrate_compress_threads();
1060 
1061     qemu_mutex_lock(&comp_done_lock);
1062     for (idx = 0; idx < thread_count; idx++) {
1063         while (!comp_param[idx].done) {
1064             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1065         }
1066     }
1067     qemu_mutex_unlock(&comp_done_lock);
1068 
1069     for (idx = 0; idx < thread_count; idx++) {
1070         qemu_mutex_lock(&comp_param[idx].mutex);
1071         if (!comp_param[idx].quit) {
1072             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1073             ram_counters.transferred += len;
1074         }
1075         qemu_mutex_unlock(&comp_param[idx].mutex);
1076     }
1077 }
1078 
1079 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1080                                        ram_addr_t offset)
1081 {
1082     param->block = block;
1083     param->offset = offset;
1084 }
1085 
1086 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1087                                            ram_addr_t offset)
1088 {
1089     int idx, thread_count, bytes_xmit = -1, pages = -1;
1090 
1091     thread_count = migrate_compress_threads();
1092     qemu_mutex_lock(&comp_done_lock);
1093     while (true) {
1094         for (idx = 0; idx < thread_count; idx++) {
1095             if (comp_param[idx].done) {
1096                 comp_param[idx].done = false;
1097                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1098                 qemu_mutex_lock(&comp_param[idx].mutex);
1099                 set_compress_params(&comp_param[idx], block, offset);
1100                 qemu_cond_signal(&comp_param[idx].cond);
1101                 qemu_mutex_unlock(&comp_param[idx].mutex);
1102                 pages = 1;
1103                 ram_counters.normal++;
1104                 ram_counters.transferred += bytes_xmit;
1105                 break;
1106             }
1107         }
1108         if (pages > 0) {
1109             break;
1110         } else {
1111             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1112         }
1113     }
1114     qemu_mutex_unlock(&comp_done_lock);
1115 
1116     return pages;
1117 }
1118 
1119 /**
1120  * ram_save_compressed_page: compress the given page and send it to the stream
1121  *
1122  * Returns the number of pages written.
1123  *
1124  * @rs: current RAM state
1125  * @block: block that contains the page we want to send
1126  * @offset: offset inside the block for the page
1127  * @last_stage: if we are at the completion stage
1128  */
1129 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1130                                     bool last_stage)
1131 {
1132     int pages = -1;
1133     uint64_t bytes_xmit = 0;
1134     uint8_t *p;
1135     int ret, blen;
1136     RAMBlock *block = pss->block;
1137     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1138 
1139     p = block->host + offset;
1140 
1141     ret = ram_control_save_page(rs->f, block->offset,
1142                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1143     if (bytes_xmit) {
1144         ram_counters.transferred += bytes_xmit;
1145         pages = 1;
1146     }
1147     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1148         if (ret != RAM_SAVE_CONTROL_DELAYED) {
1149             if (bytes_xmit > 0) {
1150                 ram_counters.normal++;
1151             } else if (bytes_xmit == 0) {
1152                 ram_counters.duplicate++;
1153             }
1154         }
1155     } else {
1156         /* When starting the process of a new block, the first page of
1157          * the block should be sent out before other pages in the same
1158          * block, and all the pages in last block should have been sent
1159          * out, keeping this order is important, because the 'cont' flag
1160          * is used to avoid resending the block name.
1161          */
1162         if (block != rs->last_sent_block) {
1163             flush_compressed_data(rs);
1164             pages = save_zero_page(rs, block, offset);
1165             if (pages == -1) {
1166                 /* Make sure the first page is sent out before other pages */
1167                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1168                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1169                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1170                                                  migrate_compress_level());
1171                 if (blen > 0) {
1172                     ram_counters.transferred += bytes_xmit + blen;
1173                     ram_counters.normal++;
1174                     pages = 1;
1175                 } else {
1176                     qemu_file_set_error(rs->f, blen);
1177                     error_report("compressed data failed!");
1178                 }
1179             }
1180             if (pages > 0) {
1181                 ram_release_pages(block->idstr, offset, pages);
1182             }
1183         } else {
1184             pages = save_zero_page(rs, block, offset);
1185             if (pages == -1) {
1186                 pages = compress_page_with_multi_thread(rs, block, offset);
1187             } else {
1188                 ram_release_pages(block->idstr, offset, pages);
1189             }
1190         }
1191     }
1192 
1193     return pages;
1194 }
1195 
1196 /**
1197  * find_dirty_block: find the next dirty page and update any state
1198  * associated with the search process.
1199  *
1200  * Returns if a page is found
1201  *
1202  * @rs: current RAM state
1203  * @pss: data about the state of the current dirty page scan
1204  * @again: set to false if the search has scanned the whole of RAM
1205  */
1206 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1207 {
1208     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1209     if (pss->complete_round && pss->block == rs->last_seen_block &&
1210         pss->page >= rs->last_page) {
1211         /*
1212          * We've been once around the RAM and haven't found anything.
1213          * Give up.
1214          */
1215         *again = false;
1216         return false;
1217     }
1218     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1219         /* Didn't find anything in this RAM Block */
1220         pss->page = 0;
1221         pss->block = QLIST_NEXT_RCU(pss->block, next);
1222         if (!pss->block) {
1223             /* Hit the end of the list */
1224             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1225             /* Flag that we've looped */
1226             pss->complete_round = true;
1227             rs->ram_bulk_stage = false;
1228             if (migrate_use_xbzrle()) {
1229                 /* If xbzrle is on, stop using the data compression at this
1230                  * point. In theory, xbzrle can do better than compression.
1231                  */
1232                 flush_compressed_data(rs);
1233             }
1234         }
1235         /* Didn't find anything this time, but try again on the new block */
1236         *again = true;
1237         return false;
1238     } else {
1239         /* Can go around again, but... */
1240         *again = true;
1241         /* We've found something so probably don't need to */
1242         return true;
1243     }
1244 }
1245 
1246 /**
1247  * unqueue_page: gets a page of the queue
1248  *
1249  * Helper for 'get_queued_page' - gets a page off the queue
1250  *
1251  * Returns the block of the page (or NULL if none available)
1252  *
1253  * @rs: current RAM state
1254  * @offset: used to return the offset within the RAMBlock
1255  */
1256 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1257 {
1258     RAMBlock *block = NULL;
1259 
1260     qemu_mutex_lock(&rs->src_page_req_mutex);
1261     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1262         struct RAMSrcPageRequest *entry =
1263                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1264         block = entry->rb;
1265         *offset = entry->offset;
1266 
1267         if (entry->len > TARGET_PAGE_SIZE) {
1268             entry->len -= TARGET_PAGE_SIZE;
1269             entry->offset += TARGET_PAGE_SIZE;
1270         } else {
1271             memory_region_unref(block->mr);
1272             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1273             g_free(entry);
1274         }
1275     }
1276     qemu_mutex_unlock(&rs->src_page_req_mutex);
1277 
1278     return block;
1279 }
1280 
1281 /**
1282  * get_queued_page: unqueue a page from the postocpy requests
1283  *
1284  * Skips pages that are already sent (!dirty)
1285  *
1286  * Returns if a queued page is found
1287  *
1288  * @rs: current RAM state
1289  * @pss: data about the state of the current dirty page scan
1290  */
1291 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1292 {
1293     RAMBlock  *block;
1294     ram_addr_t offset;
1295     bool dirty;
1296 
1297     do {
1298         block = unqueue_page(rs, &offset);
1299         /*
1300          * We're sending this page, and since it's postcopy nothing else
1301          * will dirty it, and we must make sure it doesn't get sent again
1302          * even if this queue request was received after the background
1303          * search already sent it.
1304          */
1305         if (block) {
1306             unsigned long page;
1307 
1308             page = offset >> TARGET_PAGE_BITS;
1309             dirty = test_bit(page, block->bmap);
1310             if (!dirty) {
1311                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1312                        page, test_bit(page, block->unsentmap));
1313             } else {
1314                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1315             }
1316         }
1317 
1318     } while (block && !dirty);
1319 
1320     if (block) {
1321         /*
1322          * As soon as we start servicing pages out of order, then we have
1323          * to kill the bulk stage, since the bulk stage assumes
1324          * in (migration_bitmap_find_and_reset_dirty) that every page is
1325          * dirty, that's no longer true.
1326          */
1327         rs->ram_bulk_stage = false;
1328 
1329         /*
1330          * We want the background search to continue from the queued page
1331          * since the guest is likely to want other pages near to the page
1332          * it just requested.
1333          */
1334         pss->block = block;
1335         pss->page = offset >> TARGET_PAGE_BITS;
1336     }
1337 
1338     return !!block;
1339 }
1340 
1341 /**
1342  * migration_page_queue_free: drop any remaining pages in the ram
1343  * request queue
1344  *
1345  * It should be empty at the end anyway, but in error cases there may
1346  * be some left.  in case that there is any page left, we drop it.
1347  *
1348  */
1349 static void migration_page_queue_free(RAMState *rs)
1350 {
1351     struct RAMSrcPageRequest *mspr, *next_mspr;
1352     /* This queue generally should be empty - but in the case of a failed
1353      * migration might have some droppings in.
1354      */
1355     rcu_read_lock();
1356     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1357         memory_region_unref(mspr->rb->mr);
1358         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1359         g_free(mspr);
1360     }
1361     rcu_read_unlock();
1362 }
1363 
1364 /**
1365  * ram_save_queue_pages: queue the page for transmission
1366  *
1367  * A request from postcopy destination for example.
1368  *
1369  * Returns zero on success or negative on error
1370  *
1371  * @rbname: Name of the RAMBLock of the request. NULL means the
1372  *          same that last one.
1373  * @start: starting address from the start of the RAMBlock
1374  * @len: length (in bytes) to send
1375  */
1376 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1377 {
1378     RAMBlock *ramblock;
1379     RAMState *rs = ram_state;
1380 
1381     ram_counters.postcopy_requests++;
1382     rcu_read_lock();
1383     if (!rbname) {
1384         /* Reuse last RAMBlock */
1385         ramblock = rs->last_req_rb;
1386 
1387         if (!ramblock) {
1388             /*
1389              * Shouldn't happen, we can't reuse the last RAMBlock if
1390              * it's the 1st request.
1391              */
1392             error_report("ram_save_queue_pages no previous block");
1393             goto err;
1394         }
1395     } else {
1396         ramblock = qemu_ram_block_by_name(rbname);
1397 
1398         if (!ramblock) {
1399             /* We shouldn't be asked for a non-existent RAMBlock */
1400             error_report("ram_save_queue_pages no block '%s'", rbname);
1401             goto err;
1402         }
1403         rs->last_req_rb = ramblock;
1404     }
1405     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1406     if (start+len > ramblock->used_length) {
1407         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1408                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1409                      __func__, start, len, ramblock->used_length);
1410         goto err;
1411     }
1412 
1413     struct RAMSrcPageRequest *new_entry =
1414         g_malloc0(sizeof(struct RAMSrcPageRequest));
1415     new_entry->rb = ramblock;
1416     new_entry->offset = start;
1417     new_entry->len = len;
1418 
1419     memory_region_ref(ramblock->mr);
1420     qemu_mutex_lock(&rs->src_page_req_mutex);
1421     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1422     qemu_mutex_unlock(&rs->src_page_req_mutex);
1423     rcu_read_unlock();
1424 
1425     return 0;
1426 
1427 err:
1428     rcu_read_unlock();
1429     return -1;
1430 }
1431 
1432 /**
1433  * ram_save_target_page: save one target page
1434  *
1435  * Returns the number of pages written
1436  *
1437  * @rs: current RAM state
1438  * @ms: current migration state
1439  * @pss: data about the page we want to send
1440  * @last_stage: if we are at the completion stage
1441  */
1442 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1443                                 bool last_stage)
1444 {
1445     int res = 0;
1446 
1447     /* Check the pages is dirty and if it is send it */
1448     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1449         /*
1450          * If xbzrle is on, stop using the data compression after first
1451          * round of migration even if compression is enabled. In theory,
1452          * xbzrle can do better than compression.
1453          */
1454         if (migrate_use_compression() &&
1455             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1456             res = ram_save_compressed_page(rs, pss, last_stage);
1457         } else {
1458             res = ram_save_page(rs, pss, last_stage);
1459         }
1460 
1461         if (res < 0) {
1462             return res;
1463         }
1464         if (pss->block->unsentmap) {
1465             clear_bit(pss->page, pss->block->unsentmap);
1466         }
1467     }
1468 
1469     return res;
1470 }
1471 
1472 /**
1473  * ram_save_host_page: save a whole host page
1474  *
1475  * Starting at *offset send pages up to the end of the current host
1476  * page. It's valid for the initial offset to point into the middle of
1477  * a host page in which case the remainder of the hostpage is sent.
1478  * Only dirty target pages are sent. Note that the host page size may
1479  * be a huge page for this block.
1480  * The saving stops at the boundary of the used_length of the block
1481  * if the RAMBlock isn't a multiple of the host page size.
1482  *
1483  * Returns the number of pages written or negative on error
1484  *
1485  * @rs: current RAM state
1486  * @ms: current migration state
1487  * @pss: data about the page we want to send
1488  * @last_stage: if we are at the completion stage
1489  */
1490 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1491                               bool last_stage)
1492 {
1493     int tmppages, pages = 0;
1494     size_t pagesize_bits =
1495         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1496 
1497     do {
1498         tmppages = ram_save_target_page(rs, pss, last_stage);
1499         if (tmppages < 0) {
1500             return tmppages;
1501         }
1502 
1503         pages += tmppages;
1504         pss->page++;
1505     } while ((pss->page & (pagesize_bits - 1)) &&
1506              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1507 
1508     /* The offset we leave with is the last one we looked at */
1509     pss->page--;
1510     return pages;
1511 }
1512 
1513 /**
1514  * ram_find_and_save_block: finds a dirty page and sends it to f
1515  *
1516  * Called within an RCU critical section.
1517  *
1518  * Returns the number of pages written where zero means no dirty pages
1519  *
1520  * @rs: current RAM state
1521  * @last_stage: if we are at the completion stage
1522  *
1523  * On systems where host-page-size > target-page-size it will send all the
1524  * pages in a host page that are dirty.
1525  */
1526 
1527 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1528 {
1529     PageSearchStatus pss;
1530     int pages = 0;
1531     bool again, found;
1532 
1533     /* No dirty page as there is zero RAM */
1534     if (!ram_bytes_total()) {
1535         return pages;
1536     }
1537 
1538     pss.block = rs->last_seen_block;
1539     pss.page = rs->last_page;
1540     pss.complete_round = false;
1541 
1542     if (!pss.block) {
1543         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1544     }
1545 
1546     do {
1547         again = true;
1548         found = get_queued_page(rs, &pss);
1549 
1550         if (!found) {
1551             /* priority queue empty, so just search for something dirty */
1552             found = find_dirty_block(rs, &pss, &again);
1553         }
1554 
1555         if (found) {
1556             pages = ram_save_host_page(rs, &pss, last_stage);
1557         }
1558     } while (!pages && again);
1559 
1560     rs->last_seen_block = pss.block;
1561     rs->last_page = pss.page;
1562 
1563     return pages;
1564 }
1565 
1566 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1567 {
1568     uint64_t pages = size / TARGET_PAGE_SIZE;
1569 
1570     if (zero) {
1571         ram_counters.duplicate += pages;
1572     } else {
1573         ram_counters.normal += pages;
1574         ram_counters.transferred += size;
1575         qemu_update_position(f, size);
1576     }
1577 }
1578 
1579 uint64_t ram_bytes_total(void)
1580 {
1581     RAMBlock *block;
1582     uint64_t total = 0;
1583 
1584     rcu_read_lock();
1585     RAMBLOCK_FOREACH(block) {
1586         total += block->used_length;
1587     }
1588     rcu_read_unlock();
1589     return total;
1590 }
1591 
1592 static void xbzrle_load_setup(void)
1593 {
1594     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1595 }
1596 
1597 static void xbzrle_load_cleanup(void)
1598 {
1599     g_free(XBZRLE.decoded_buf);
1600     XBZRLE.decoded_buf = NULL;
1601 }
1602 
1603 static void ram_state_cleanup(RAMState **rsp)
1604 {
1605     if (*rsp) {
1606         migration_page_queue_free(*rsp);
1607         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1608         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1609         g_free(*rsp);
1610         *rsp = NULL;
1611     }
1612 }
1613 
1614 static void xbzrle_cleanup(void)
1615 {
1616     XBZRLE_cache_lock();
1617     if (XBZRLE.cache) {
1618         cache_fini(XBZRLE.cache);
1619         g_free(XBZRLE.encoded_buf);
1620         g_free(XBZRLE.current_buf);
1621         g_free(XBZRLE.zero_target_page);
1622         XBZRLE.cache = NULL;
1623         XBZRLE.encoded_buf = NULL;
1624         XBZRLE.current_buf = NULL;
1625         XBZRLE.zero_target_page = NULL;
1626     }
1627     XBZRLE_cache_unlock();
1628 }
1629 
1630 static void ram_save_cleanup(void *opaque)
1631 {
1632     RAMState **rsp = opaque;
1633     RAMBlock *block;
1634 
1635     /* caller have hold iothread lock or is in a bh, so there is
1636      * no writing race against this migration_bitmap
1637      */
1638     memory_global_dirty_log_stop();
1639 
1640     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1641         g_free(block->bmap);
1642         block->bmap = NULL;
1643         g_free(block->unsentmap);
1644         block->unsentmap = NULL;
1645     }
1646 
1647     xbzrle_cleanup();
1648     compress_threads_save_cleanup();
1649     ram_state_cleanup(rsp);
1650 }
1651 
1652 static void ram_state_reset(RAMState *rs)
1653 {
1654     rs->last_seen_block = NULL;
1655     rs->last_sent_block = NULL;
1656     rs->last_page = 0;
1657     rs->last_version = ram_list.version;
1658     rs->ram_bulk_stage = true;
1659 }
1660 
1661 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1662 
1663 /*
1664  * 'expected' is the value you expect the bitmap mostly to be full
1665  * of; it won't bother printing lines that are all this value.
1666  * If 'todump' is null the migration bitmap is dumped.
1667  */
1668 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1669                            unsigned long pages)
1670 {
1671     int64_t cur;
1672     int64_t linelen = 128;
1673     char linebuf[129];
1674 
1675     for (cur = 0; cur < pages; cur += linelen) {
1676         int64_t curb;
1677         bool found = false;
1678         /*
1679          * Last line; catch the case where the line length
1680          * is longer than remaining ram
1681          */
1682         if (cur + linelen > pages) {
1683             linelen = pages - cur;
1684         }
1685         for (curb = 0; curb < linelen; curb++) {
1686             bool thisbit = test_bit(cur + curb, todump);
1687             linebuf[curb] = thisbit ? '1' : '.';
1688             found = found || (thisbit != expected);
1689         }
1690         if (found) {
1691             linebuf[curb] = '\0';
1692             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1693         }
1694     }
1695 }
1696 
1697 /* **** functions for postcopy ***** */
1698 
1699 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1700 {
1701     struct RAMBlock *block;
1702 
1703     RAMBLOCK_FOREACH(block) {
1704         unsigned long *bitmap = block->bmap;
1705         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1706         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1707 
1708         while (run_start < range) {
1709             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1710             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1711                               (run_end - run_start) << TARGET_PAGE_BITS);
1712             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1713         }
1714     }
1715 }
1716 
1717 /**
1718  * postcopy_send_discard_bm_ram: discard a RAMBlock
1719  *
1720  * Returns zero on success
1721  *
1722  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1723  * Note: At this point the 'unsentmap' is the processed bitmap combined
1724  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1725  *
1726  * @ms: current migration state
1727  * @pds: state for postcopy
1728  * @start: RAMBlock starting page
1729  * @length: RAMBlock size
1730  */
1731 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1732                                         PostcopyDiscardState *pds,
1733                                         RAMBlock *block)
1734 {
1735     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1736     unsigned long current;
1737     unsigned long *unsentmap = block->unsentmap;
1738 
1739     for (current = 0; current < end; ) {
1740         unsigned long one = find_next_bit(unsentmap, end, current);
1741 
1742         if (one <= end) {
1743             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1744             unsigned long discard_length;
1745 
1746             if (zero >= end) {
1747                 discard_length = end - one;
1748             } else {
1749                 discard_length = zero - one;
1750             }
1751             if (discard_length) {
1752                 postcopy_discard_send_range(ms, pds, one, discard_length);
1753             }
1754             current = one + discard_length;
1755         } else {
1756             current = one;
1757         }
1758     }
1759 
1760     return 0;
1761 }
1762 
1763 /**
1764  * postcopy_each_ram_send_discard: discard all RAMBlocks
1765  *
1766  * Returns 0 for success or negative for error
1767  *
1768  * Utility for the outgoing postcopy code.
1769  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1770  *   passing it bitmap indexes and name.
1771  * (qemu_ram_foreach_block ends up passing unscaled lengths
1772  *  which would mean postcopy code would have to deal with target page)
1773  *
1774  * @ms: current migration state
1775  */
1776 static int postcopy_each_ram_send_discard(MigrationState *ms)
1777 {
1778     struct RAMBlock *block;
1779     int ret;
1780 
1781     RAMBLOCK_FOREACH(block) {
1782         PostcopyDiscardState *pds =
1783             postcopy_discard_send_init(ms, block->idstr);
1784 
1785         /*
1786          * Postcopy sends chunks of bitmap over the wire, but it
1787          * just needs indexes at this point, avoids it having
1788          * target page specific code.
1789          */
1790         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1791         postcopy_discard_send_finish(ms, pds);
1792         if (ret) {
1793             return ret;
1794         }
1795     }
1796 
1797     return 0;
1798 }
1799 
1800 /**
1801  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1802  *
1803  * Helper for postcopy_chunk_hostpages; it's called twice to
1804  * canonicalize the two bitmaps, that are similar, but one is
1805  * inverted.
1806  *
1807  * Postcopy requires that all target pages in a hostpage are dirty or
1808  * clean, not a mix.  This function canonicalizes the bitmaps.
1809  *
1810  * @ms: current migration state
1811  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1812  *               otherwise we need to canonicalize partially dirty host pages
1813  * @block: block that contains the page we want to canonicalize
1814  * @pds: state for postcopy
1815  */
1816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1817                                           RAMBlock *block,
1818                                           PostcopyDiscardState *pds)
1819 {
1820     RAMState *rs = ram_state;
1821     unsigned long *bitmap = block->bmap;
1822     unsigned long *unsentmap = block->unsentmap;
1823     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1824     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1825     unsigned long run_start;
1826 
1827     if (block->page_size == TARGET_PAGE_SIZE) {
1828         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1829         return;
1830     }
1831 
1832     if (unsent_pass) {
1833         /* Find a sent page */
1834         run_start = find_next_zero_bit(unsentmap, pages, 0);
1835     } else {
1836         /* Find a dirty page */
1837         run_start = find_next_bit(bitmap, pages, 0);
1838     }
1839 
1840     while (run_start < pages) {
1841         bool do_fixup = false;
1842         unsigned long fixup_start_addr;
1843         unsigned long host_offset;
1844 
1845         /*
1846          * If the start of this run of pages is in the middle of a host
1847          * page, then we need to fixup this host page.
1848          */
1849         host_offset = run_start % host_ratio;
1850         if (host_offset) {
1851             do_fixup = true;
1852             run_start -= host_offset;
1853             fixup_start_addr = run_start;
1854             /* For the next pass */
1855             run_start = run_start + host_ratio;
1856         } else {
1857             /* Find the end of this run */
1858             unsigned long run_end;
1859             if (unsent_pass) {
1860                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1861             } else {
1862                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1863             }
1864             /*
1865              * If the end isn't at the start of a host page, then the
1866              * run doesn't finish at the end of a host page
1867              * and we need to discard.
1868              */
1869             host_offset = run_end % host_ratio;
1870             if (host_offset) {
1871                 do_fixup = true;
1872                 fixup_start_addr = run_end - host_offset;
1873                 /*
1874                  * This host page has gone, the next loop iteration starts
1875                  * from after the fixup
1876                  */
1877                 run_start = fixup_start_addr + host_ratio;
1878             } else {
1879                 /*
1880                  * No discards on this iteration, next loop starts from
1881                  * next sent/dirty page
1882                  */
1883                 run_start = run_end + 1;
1884             }
1885         }
1886 
1887         if (do_fixup) {
1888             unsigned long page;
1889 
1890             /* Tell the destination to discard this page */
1891             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1892                 /* For the unsent_pass we:
1893                  *     discard partially sent pages
1894                  * For the !unsent_pass (dirty) we:
1895                  *     discard partially dirty pages that were sent
1896                  *     (any partially sent pages were already discarded
1897                  *     by the previous unsent_pass)
1898                  */
1899                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1900                                             host_ratio);
1901             }
1902 
1903             /* Clean up the bitmap */
1904             for (page = fixup_start_addr;
1905                  page < fixup_start_addr + host_ratio; page++) {
1906                 /* All pages in this host page are now not sent */
1907                 set_bit(page, unsentmap);
1908 
1909                 /*
1910                  * Remark them as dirty, updating the count for any pages
1911                  * that weren't previously dirty.
1912                  */
1913                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1914             }
1915         }
1916 
1917         if (unsent_pass) {
1918             /* Find the next sent page for the next iteration */
1919             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1920         } else {
1921             /* Find the next dirty page for the next iteration */
1922             run_start = find_next_bit(bitmap, pages, run_start);
1923         }
1924     }
1925 }
1926 
1927 /**
1928  * postcopy_chuck_hostpages: discrad any partially sent host page
1929  *
1930  * Utility for the outgoing postcopy code.
1931  *
1932  * Discard any partially sent host-page size chunks, mark any partially
1933  * dirty host-page size chunks as all dirty.  In this case the host-page
1934  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1935  *
1936  * Returns zero on success
1937  *
1938  * @ms: current migration state
1939  * @block: block we want to work with
1940  */
1941 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1942 {
1943     PostcopyDiscardState *pds =
1944         postcopy_discard_send_init(ms, block->idstr);
1945 
1946     /* First pass: Discard all partially sent host pages */
1947     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1948     /*
1949      * Second pass: Ensure that all partially dirty host pages are made
1950      * fully dirty.
1951      */
1952     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1953 
1954     postcopy_discard_send_finish(ms, pds);
1955     return 0;
1956 }
1957 
1958 /**
1959  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1960  *
1961  * Returns zero on success
1962  *
1963  * Transmit the set of pages to be discarded after precopy to the target
1964  * these are pages that:
1965  *     a) Have been previously transmitted but are now dirty again
1966  *     b) Pages that have never been transmitted, this ensures that
1967  *        any pages on the destination that have been mapped by background
1968  *        tasks get discarded (transparent huge pages is the specific concern)
1969  * Hopefully this is pretty sparse
1970  *
1971  * @ms: current migration state
1972  */
1973 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1974 {
1975     RAMState *rs = ram_state;
1976     RAMBlock *block;
1977     int ret;
1978 
1979     rcu_read_lock();
1980 
1981     /* This should be our last sync, the src is now paused */
1982     migration_bitmap_sync(rs);
1983 
1984     /* Easiest way to make sure we don't resume in the middle of a host-page */
1985     rs->last_seen_block = NULL;
1986     rs->last_sent_block = NULL;
1987     rs->last_page = 0;
1988 
1989     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1990         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1991         unsigned long *bitmap = block->bmap;
1992         unsigned long *unsentmap = block->unsentmap;
1993 
1994         if (!unsentmap) {
1995             /* We don't have a safe way to resize the sentmap, so
1996              * if the bitmap was resized it will be NULL at this
1997              * point.
1998              */
1999             error_report("migration ram resized during precopy phase");
2000             rcu_read_unlock();
2001             return -EINVAL;
2002         }
2003         /* Deal with TPS != HPS and huge pages */
2004         ret = postcopy_chunk_hostpages(ms, block);
2005         if (ret) {
2006             rcu_read_unlock();
2007             return ret;
2008         }
2009 
2010         /*
2011          * Update the unsentmap to be unsentmap = unsentmap | dirty
2012          */
2013         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2014 #ifdef DEBUG_POSTCOPY
2015         ram_debug_dump_bitmap(unsentmap, true, pages);
2016 #endif
2017     }
2018     trace_ram_postcopy_send_discard_bitmap();
2019 
2020     ret = postcopy_each_ram_send_discard(ms);
2021     rcu_read_unlock();
2022 
2023     return ret;
2024 }
2025 
2026 /**
2027  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2028  *
2029  * Returns zero on success
2030  *
2031  * @rbname: name of the RAMBlock of the request. NULL means the
2032  *          same that last one.
2033  * @start: RAMBlock starting page
2034  * @length: RAMBlock size
2035  */
2036 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2037 {
2038     int ret = -1;
2039 
2040     trace_ram_discard_range(rbname, start, length);
2041 
2042     rcu_read_lock();
2043     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2044 
2045     if (!rb) {
2046         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2047         goto err;
2048     }
2049 
2050     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2051                  length >> qemu_target_page_bits());
2052     ret = ram_block_discard_range(rb, start, length);
2053 
2054 err:
2055     rcu_read_unlock();
2056 
2057     return ret;
2058 }
2059 
2060 /*
2061  * For every allocation, we will try not to crash the VM if the
2062  * allocation failed.
2063  */
2064 static int xbzrle_init(void)
2065 {
2066     Error *local_err = NULL;
2067 
2068     if (!migrate_use_xbzrle()) {
2069         return 0;
2070     }
2071 
2072     XBZRLE_cache_lock();
2073 
2074     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2075     if (!XBZRLE.zero_target_page) {
2076         error_report("%s: Error allocating zero page", __func__);
2077         goto err_out;
2078     }
2079 
2080     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2081                               TARGET_PAGE_SIZE, &local_err);
2082     if (!XBZRLE.cache) {
2083         error_report_err(local_err);
2084         goto free_zero_page;
2085     }
2086 
2087     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2088     if (!XBZRLE.encoded_buf) {
2089         error_report("%s: Error allocating encoded_buf", __func__);
2090         goto free_cache;
2091     }
2092 
2093     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2094     if (!XBZRLE.current_buf) {
2095         error_report("%s: Error allocating current_buf", __func__);
2096         goto free_encoded_buf;
2097     }
2098 
2099     /* We are all good */
2100     XBZRLE_cache_unlock();
2101     return 0;
2102 
2103 free_encoded_buf:
2104     g_free(XBZRLE.encoded_buf);
2105     XBZRLE.encoded_buf = NULL;
2106 free_cache:
2107     cache_fini(XBZRLE.cache);
2108     XBZRLE.cache = NULL;
2109 free_zero_page:
2110     g_free(XBZRLE.zero_target_page);
2111     XBZRLE.zero_target_page = NULL;
2112 err_out:
2113     XBZRLE_cache_unlock();
2114     return -ENOMEM;
2115 }
2116 
2117 static int ram_state_init(RAMState **rsp)
2118 {
2119     *rsp = g_try_new0(RAMState, 1);
2120 
2121     if (!*rsp) {
2122         error_report("%s: Init ramstate fail", __func__);
2123         return -1;
2124     }
2125 
2126     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2127     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2128     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2129 
2130     /*
2131      * Count the total number of pages used by ram blocks not including any
2132      * gaps due to alignment or unplugs.
2133      */
2134     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2135 
2136     ram_state_reset(*rsp);
2137 
2138     return 0;
2139 }
2140 
2141 static void ram_list_init_bitmaps(void)
2142 {
2143     RAMBlock *block;
2144     unsigned long pages;
2145 
2146     /* Skip setting bitmap if there is no RAM */
2147     if (ram_bytes_total()) {
2148         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2149             pages = block->max_length >> TARGET_PAGE_BITS;
2150             block->bmap = bitmap_new(pages);
2151             bitmap_set(block->bmap, 0, pages);
2152             if (migrate_postcopy_ram()) {
2153                 block->unsentmap = bitmap_new(pages);
2154                 bitmap_set(block->unsentmap, 0, pages);
2155             }
2156         }
2157     }
2158 }
2159 
2160 static void ram_init_bitmaps(RAMState *rs)
2161 {
2162     /* For memory_global_dirty_log_start below.  */
2163     qemu_mutex_lock_iothread();
2164     qemu_mutex_lock_ramlist();
2165     rcu_read_lock();
2166 
2167     ram_list_init_bitmaps();
2168     memory_global_dirty_log_start();
2169     migration_bitmap_sync(rs);
2170 
2171     rcu_read_unlock();
2172     qemu_mutex_unlock_ramlist();
2173     qemu_mutex_unlock_iothread();
2174 }
2175 
2176 static int ram_init_all(RAMState **rsp)
2177 {
2178     if (ram_state_init(rsp)) {
2179         return -1;
2180     }
2181 
2182     if (xbzrle_init()) {
2183         ram_state_cleanup(rsp);
2184         return -1;
2185     }
2186 
2187     ram_init_bitmaps(*rsp);
2188 
2189     return 0;
2190 }
2191 
2192 /*
2193  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2194  * long-running RCU critical section.  When rcu-reclaims in the code
2195  * start to become numerous it will be necessary to reduce the
2196  * granularity of these critical sections.
2197  */
2198 
2199 /**
2200  * ram_save_setup: Setup RAM for migration
2201  *
2202  * Returns zero to indicate success and negative for error
2203  *
2204  * @f: QEMUFile where to send the data
2205  * @opaque: RAMState pointer
2206  */
2207 static int ram_save_setup(QEMUFile *f, void *opaque)
2208 {
2209     RAMState **rsp = opaque;
2210     RAMBlock *block;
2211 
2212     /* migration has already setup the bitmap, reuse it. */
2213     if (!migration_in_colo_state()) {
2214         if (ram_init_all(rsp) != 0) {
2215             return -1;
2216         }
2217     }
2218     (*rsp)->f = f;
2219 
2220     rcu_read_lock();
2221 
2222     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2223 
2224     RAMBLOCK_FOREACH(block) {
2225         qemu_put_byte(f, strlen(block->idstr));
2226         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2227         qemu_put_be64(f, block->used_length);
2228         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2229             qemu_put_be64(f, block->page_size);
2230         }
2231     }
2232 
2233     rcu_read_unlock();
2234     compress_threads_save_setup();
2235 
2236     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2237     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2238 
2239     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2240 
2241     return 0;
2242 }
2243 
2244 /**
2245  * ram_save_iterate: iterative stage for migration
2246  *
2247  * Returns zero to indicate success and negative for error
2248  *
2249  * @f: QEMUFile where to send the data
2250  * @opaque: RAMState pointer
2251  */
2252 static int ram_save_iterate(QEMUFile *f, void *opaque)
2253 {
2254     RAMState **temp = opaque;
2255     RAMState *rs = *temp;
2256     int ret;
2257     int i;
2258     int64_t t0;
2259     int done = 0;
2260 
2261     if (blk_mig_bulk_active()) {
2262         /* Avoid transferring ram during bulk phase of block migration as
2263          * the bulk phase will usually take a long time and transferring
2264          * ram updates during that time is pointless. */
2265         goto out;
2266     }
2267 
2268     rcu_read_lock();
2269     if (ram_list.version != rs->last_version) {
2270         ram_state_reset(rs);
2271     }
2272 
2273     /* Read version before ram_list.blocks */
2274     smp_rmb();
2275 
2276     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2277 
2278     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2279     i = 0;
2280     while ((ret = qemu_file_rate_limit(f)) == 0) {
2281         int pages;
2282 
2283         pages = ram_find_and_save_block(rs, false);
2284         /* no more pages to sent */
2285         if (pages == 0) {
2286             done = 1;
2287             break;
2288         }
2289         rs->iterations++;
2290 
2291         /* we want to check in the 1st loop, just in case it was the 1st time
2292            and we had to sync the dirty bitmap.
2293            qemu_get_clock_ns() is a bit expensive, so we only check each some
2294            iterations
2295         */
2296         if ((i & 63) == 0) {
2297             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2298             if (t1 > MAX_WAIT) {
2299                 trace_ram_save_iterate_big_wait(t1, i);
2300                 break;
2301             }
2302         }
2303         i++;
2304     }
2305     flush_compressed_data(rs);
2306     rcu_read_unlock();
2307 
2308     /*
2309      * Must occur before EOS (or any QEMUFile operation)
2310      * because of RDMA protocol.
2311      */
2312     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2313 
2314 out:
2315     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2316     ram_counters.transferred += 8;
2317 
2318     ret = qemu_file_get_error(f);
2319     if (ret < 0) {
2320         return ret;
2321     }
2322 
2323     return done;
2324 }
2325 
2326 /**
2327  * ram_save_complete: function called to send the remaining amount of ram
2328  *
2329  * Returns zero to indicate success
2330  *
2331  * Called with iothread lock
2332  *
2333  * @f: QEMUFile where to send the data
2334  * @opaque: RAMState pointer
2335  */
2336 static int ram_save_complete(QEMUFile *f, void *opaque)
2337 {
2338     RAMState **temp = opaque;
2339     RAMState *rs = *temp;
2340 
2341     rcu_read_lock();
2342 
2343     if (!migration_in_postcopy()) {
2344         migration_bitmap_sync(rs);
2345     }
2346 
2347     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2348 
2349     /* try transferring iterative blocks of memory */
2350 
2351     /* flush all remaining blocks regardless of rate limiting */
2352     while (true) {
2353         int pages;
2354 
2355         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2356         /* no more blocks to sent */
2357         if (pages == 0) {
2358             break;
2359         }
2360     }
2361 
2362     flush_compressed_data(rs);
2363     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2364 
2365     rcu_read_unlock();
2366 
2367     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2368 
2369     return 0;
2370 }
2371 
2372 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2373                              uint64_t *res_precopy_only,
2374                              uint64_t *res_compatible,
2375                              uint64_t *res_postcopy_only)
2376 {
2377     RAMState **temp = opaque;
2378     RAMState *rs = *temp;
2379     uint64_t remaining_size;
2380 
2381     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2382 
2383     if (!migration_in_postcopy() &&
2384         remaining_size < max_size) {
2385         qemu_mutex_lock_iothread();
2386         rcu_read_lock();
2387         migration_bitmap_sync(rs);
2388         rcu_read_unlock();
2389         qemu_mutex_unlock_iothread();
2390         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2391     }
2392 
2393     if (migrate_postcopy_ram()) {
2394         /* We can do postcopy, and all the data is postcopiable */
2395         *res_compatible += remaining_size;
2396     } else {
2397         *res_precopy_only += remaining_size;
2398     }
2399 }
2400 
2401 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2402 {
2403     unsigned int xh_len;
2404     int xh_flags;
2405     uint8_t *loaded_data;
2406 
2407     /* extract RLE header */
2408     xh_flags = qemu_get_byte(f);
2409     xh_len = qemu_get_be16(f);
2410 
2411     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2412         error_report("Failed to load XBZRLE page - wrong compression!");
2413         return -1;
2414     }
2415 
2416     if (xh_len > TARGET_PAGE_SIZE) {
2417         error_report("Failed to load XBZRLE page - len overflow!");
2418         return -1;
2419     }
2420     loaded_data = XBZRLE.decoded_buf;
2421     /* load data and decode */
2422     /* it can change loaded_data to point to an internal buffer */
2423     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2424 
2425     /* decode RLE */
2426     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2427                              TARGET_PAGE_SIZE) == -1) {
2428         error_report("Failed to load XBZRLE page - decode error!");
2429         return -1;
2430     }
2431 
2432     return 0;
2433 }
2434 
2435 /**
2436  * ram_block_from_stream: read a RAMBlock id from the migration stream
2437  *
2438  * Must be called from within a rcu critical section.
2439  *
2440  * Returns a pointer from within the RCU-protected ram_list.
2441  *
2442  * @f: QEMUFile where to read the data from
2443  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2444  */
2445 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2446 {
2447     static RAMBlock *block = NULL;
2448     char id[256];
2449     uint8_t len;
2450 
2451     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2452         if (!block) {
2453             error_report("Ack, bad migration stream!");
2454             return NULL;
2455         }
2456         return block;
2457     }
2458 
2459     len = qemu_get_byte(f);
2460     qemu_get_buffer(f, (uint8_t *)id, len);
2461     id[len] = 0;
2462 
2463     block = qemu_ram_block_by_name(id);
2464     if (!block) {
2465         error_report("Can't find block %s", id);
2466         return NULL;
2467     }
2468 
2469     return block;
2470 }
2471 
2472 static inline void *host_from_ram_block_offset(RAMBlock *block,
2473                                                ram_addr_t offset)
2474 {
2475     if (!offset_in_ramblock(block, offset)) {
2476         return NULL;
2477     }
2478 
2479     return block->host + offset;
2480 }
2481 
2482 /**
2483  * ram_handle_compressed: handle the zero page case
2484  *
2485  * If a page (or a whole RDMA chunk) has been
2486  * determined to be zero, then zap it.
2487  *
2488  * @host: host address for the zero page
2489  * @ch: what the page is filled from.  We only support zero
2490  * @size: size of the zero page
2491  */
2492 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2493 {
2494     if (ch != 0 || !is_zero_range(host, size)) {
2495         memset(host, ch, size);
2496     }
2497 }
2498 
2499 static void *do_data_decompress(void *opaque)
2500 {
2501     DecompressParam *param = opaque;
2502     unsigned long pagesize;
2503     uint8_t *des;
2504     int len;
2505 
2506     qemu_mutex_lock(&param->mutex);
2507     while (!param->quit) {
2508         if (param->des) {
2509             des = param->des;
2510             len = param->len;
2511             param->des = 0;
2512             qemu_mutex_unlock(&param->mutex);
2513 
2514             pagesize = TARGET_PAGE_SIZE;
2515             /* uncompress() will return failed in some case, especially
2516              * when the page is dirted when doing the compression, it's
2517              * not a problem because the dirty page will be retransferred
2518              * and uncompress() won't break the data in other pages.
2519              */
2520             uncompress((Bytef *)des, &pagesize,
2521                        (const Bytef *)param->compbuf, len);
2522 
2523             qemu_mutex_lock(&decomp_done_lock);
2524             param->done = true;
2525             qemu_cond_signal(&decomp_done_cond);
2526             qemu_mutex_unlock(&decomp_done_lock);
2527 
2528             qemu_mutex_lock(&param->mutex);
2529         } else {
2530             qemu_cond_wait(&param->cond, &param->mutex);
2531         }
2532     }
2533     qemu_mutex_unlock(&param->mutex);
2534 
2535     return NULL;
2536 }
2537 
2538 static void wait_for_decompress_done(void)
2539 {
2540     int idx, thread_count;
2541 
2542     if (!migrate_use_compression()) {
2543         return;
2544     }
2545 
2546     thread_count = migrate_decompress_threads();
2547     qemu_mutex_lock(&decomp_done_lock);
2548     for (idx = 0; idx < thread_count; idx++) {
2549         while (!decomp_param[idx].done) {
2550             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2551         }
2552     }
2553     qemu_mutex_unlock(&decomp_done_lock);
2554 }
2555 
2556 static void compress_threads_load_setup(void)
2557 {
2558     int i, thread_count;
2559 
2560     if (!migrate_use_compression()) {
2561         return;
2562     }
2563     thread_count = migrate_decompress_threads();
2564     decompress_threads = g_new0(QemuThread, thread_count);
2565     decomp_param = g_new0(DecompressParam, thread_count);
2566     qemu_mutex_init(&decomp_done_lock);
2567     qemu_cond_init(&decomp_done_cond);
2568     for (i = 0; i < thread_count; i++) {
2569         qemu_mutex_init(&decomp_param[i].mutex);
2570         qemu_cond_init(&decomp_param[i].cond);
2571         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2572         decomp_param[i].done = true;
2573         decomp_param[i].quit = false;
2574         qemu_thread_create(decompress_threads + i, "decompress",
2575                            do_data_decompress, decomp_param + i,
2576                            QEMU_THREAD_JOINABLE);
2577     }
2578 }
2579 
2580 static void compress_threads_load_cleanup(void)
2581 {
2582     int i, thread_count;
2583 
2584     if (!migrate_use_compression()) {
2585         return;
2586     }
2587     thread_count = migrate_decompress_threads();
2588     for (i = 0; i < thread_count; i++) {
2589         qemu_mutex_lock(&decomp_param[i].mutex);
2590         decomp_param[i].quit = true;
2591         qemu_cond_signal(&decomp_param[i].cond);
2592         qemu_mutex_unlock(&decomp_param[i].mutex);
2593     }
2594     for (i = 0; i < thread_count; i++) {
2595         qemu_thread_join(decompress_threads + i);
2596         qemu_mutex_destroy(&decomp_param[i].mutex);
2597         qemu_cond_destroy(&decomp_param[i].cond);
2598         g_free(decomp_param[i].compbuf);
2599     }
2600     g_free(decompress_threads);
2601     g_free(decomp_param);
2602     decompress_threads = NULL;
2603     decomp_param = NULL;
2604 }
2605 
2606 static void decompress_data_with_multi_threads(QEMUFile *f,
2607                                                void *host, int len)
2608 {
2609     int idx, thread_count;
2610 
2611     thread_count = migrate_decompress_threads();
2612     qemu_mutex_lock(&decomp_done_lock);
2613     while (true) {
2614         for (idx = 0; idx < thread_count; idx++) {
2615             if (decomp_param[idx].done) {
2616                 decomp_param[idx].done = false;
2617                 qemu_mutex_lock(&decomp_param[idx].mutex);
2618                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2619                 decomp_param[idx].des = host;
2620                 decomp_param[idx].len = len;
2621                 qemu_cond_signal(&decomp_param[idx].cond);
2622                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2623                 break;
2624             }
2625         }
2626         if (idx < thread_count) {
2627             break;
2628         } else {
2629             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2630         }
2631     }
2632     qemu_mutex_unlock(&decomp_done_lock);
2633 }
2634 
2635 /**
2636  * ram_load_setup: Setup RAM for migration incoming side
2637  *
2638  * Returns zero to indicate success and negative for error
2639  *
2640  * @f: QEMUFile where to receive the data
2641  * @opaque: RAMState pointer
2642  */
2643 static int ram_load_setup(QEMUFile *f, void *opaque)
2644 {
2645     xbzrle_load_setup();
2646     compress_threads_load_setup();
2647     ramblock_recv_map_init();
2648     return 0;
2649 }
2650 
2651 static int ram_load_cleanup(void *opaque)
2652 {
2653     RAMBlock *rb;
2654     xbzrle_load_cleanup();
2655     compress_threads_load_cleanup();
2656 
2657     RAMBLOCK_FOREACH(rb) {
2658         g_free(rb->receivedmap);
2659         rb->receivedmap = NULL;
2660     }
2661     return 0;
2662 }
2663 
2664 /**
2665  * ram_postcopy_incoming_init: allocate postcopy data structures
2666  *
2667  * Returns 0 for success and negative if there was one error
2668  *
2669  * @mis: current migration incoming state
2670  *
2671  * Allocate data structures etc needed by incoming migration with
2672  * postcopy-ram. postcopy-ram's similarly names
2673  * postcopy_ram_incoming_init does the work.
2674  */
2675 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2676 {
2677     unsigned long ram_pages = last_ram_page();
2678 
2679     return postcopy_ram_incoming_init(mis, ram_pages);
2680 }
2681 
2682 /**
2683  * ram_load_postcopy: load a page in postcopy case
2684  *
2685  * Returns 0 for success or -errno in case of error
2686  *
2687  * Called in postcopy mode by ram_load().
2688  * rcu_read_lock is taken prior to this being called.
2689  *
2690  * @f: QEMUFile where to send the data
2691  */
2692 static int ram_load_postcopy(QEMUFile *f)
2693 {
2694     int flags = 0, ret = 0;
2695     bool place_needed = false;
2696     bool matching_page_sizes = false;
2697     MigrationIncomingState *mis = migration_incoming_get_current();
2698     /* Temporary page that is later 'placed' */
2699     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2700     void *last_host = NULL;
2701     bool all_zero = false;
2702 
2703     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2704         ram_addr_t addr;
2705         void *host = NULL;
2706         void *page_buffer = NULL;
2707         void *place_source = NULL;
2708         RAMBlock *block = NULL;
2709         uint8_t ch;
2710 
2711         addr = qemu_get_be64(f);
2712 
2713         /*
2714          * If qemu file error, we should stop here, and then "addr"
2715          * may be invalid
2716          */
2717         ret = qemu_file_get_error(f);
2718         if (ret) {
2719             break;
2720         }
2721 
2722         flags = addr & ~TARGET_PAGE_MASK;
2723         addr &= TARGET_PAGE_MASK;
2724 
2725         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2726         place_needed = false;
2727         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2728             block = ram_block_from_stream(f, flags);
2729 
2730             host = host_from_ram_block_offset(block, addr);
2731             if (!host) {
2732                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2733                 ret = -EINVAL;
2734                 break;
2735             }
2736             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2737             /*
2738              * Postcopy requires that we place whole host pages atomically;
2739              * these may be huge pages for RAMBlocks that are backed by
2740              * hugetlbfs.
2741              * To make it atomic, the data is read into a temporary page
2742              * that's moved into place later.
2743              * The migration protocol uses,  possibly smaller, target-pages
2744              * however the source ensures it always sends all the components
2745              * of a host page in order.
2746              */
2747             page_buffer = postcopy_host_page +
2748                           ((uintptr_t)host & (block->page_size - 1));
2749             /* If all TP are zero then we can optimise the place */
2750             if (!((uintptr_t)host & (block->page_size - 1))) {
2751                 all_zero = true;
2752             } else {
2753                 /* not the 1st TP within the HP */
2754                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2755                     error_report("Non-sequential target page %p/%p",
2756                                   host, last_host);
2757                     ret = -EINVAL;
2758                     break;
2759                 }
2760             }
2761 
2762 
2763             /*
2764              * If it's the last part of a host page then we place the host
2765              * page
2766              */
2767             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2768                                      (block->page_size - 1)) == 0;
2769             place_source = postcopy_host_page;
2770         }
2771         last_host = host;
2772 
2773         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2774         case RAM_SAVE_FLAG_ZERO:
2775             ch = qemu_get_byte(f);
2776             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2777             if (ch) {
2778                 all_zero = false;
2779             }
2780             break;
2781 
2782         case RAM_SAVE_FLAG_PAGE:
2783             all_zero = false;
2784             if (!place_needed || !matching_page_sizes) {
2785                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2786             } else {
2787                 /* Avoids the qemu_file copy during postcopy, which is
2788                  * going to do a copy later; can only do it when we
2789                  * do this read in one go (matching page sizes)
2790                  */
2791                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2792                                          TARGET_PAGE_SIZE);
2793             }
2794             break;
2795         case RAM_SAVE_FLAG_EOS:
2796             /* normal exit */
2797             break;
2798         default:
2799             error_report("Unknown combination of migration flags: %#x"
2800                          " (postcopy mode)", flags);
2801             ret = -EINVAL;
2802             break;
2803         }
2804 
2805         /* Detect for any possible file errors */
2806         if (!ret && qemu_file_get_error(f)) {
2807             ret = qemu_file_get_error(f);
2808         }
2809 
2810         if (!ret && place_needed) {
2811             /* This gets called at the last target page in the host page */
2812             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2813 
2814             if (all_zero) {
2815                 ret = postcopy_place_page_zero(mis, place_dest,
2816                                                block);
2817             } else {
2818                 ret = postcopy_place_page(mis, place_dest,
2819                                           place_source, block);
2820             }
2821         }
2822     }
2823 
2824     return ret;
2825 }
2826 
2827 static bool postcopy_is_advised(void)
2828 {
2829     PostcopyState ps = postcopy_state_get();
2830     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2831 }
2832 
2833 static bool postcopy_is_running(void)
2834 {
2835     PostcopyState ps = postcopy_state_get();
2836     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2837 }
2838 
2839 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2840 {
2841     int flags = 0, ret = 0, invalid_flags = 0;
2842     static uint64_t seq_iter;
2843     int len = 0;
2844     /*
2845      * If system is running in postcopy mode, page inserts to host memory must
2846      * be atomic
2847      */
2848     bool postcopy_running = postcopy_is_running();
2849     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2850     bool postcopy_advised = postcopy_is_advised();
2851 
2852     seq_iter++;
2853 
2854     if (version_id != 4) {
2855         ret = -EINVAL;
2856     }
2857 
2858     if (!migrate_use_compression()) {
2859         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2860     }
2861     /* This RCU critical section can be very long running.
2862      * When RCU reclaims in the code start to become numerous,
2863      * it will be necessary to reduce the granularity of this
2864      * critical section.
2865      */
2866     rcu_read_lock();
2867 
2868     if (postcopy_running) {
2869         ret = ram_load_postcopy(f);
2870     }
2871 
2872     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2873         ram_addr_t addr, total_ram_bytes;
2874         void *host = NULL;
2875         uint8_t ch;
2876 
2877         addr = qemu_get_be64(f);
2878         flags = addr & ~TARGET_PAGE_MASK;
2879         addr &= TARGET_PAGE_MASK;
2880 
2881         if (flags & invalid_flags) {
2882             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2883                 error_report("Received an unexpected compressed page");
2884             }
2885 
2886             ret = -EINVAL;
2887             break;
2888         }
2889 
2890         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2891                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2892             RAMBlock *block = ram_block_from_stream(f, flags);
2893 
2894             host = host_from_ram_block_offset(block, addr);
2895             if (!host) {
2896                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2897                 ret = -EINVAL;
2898                 break;
2899             }
2900             ramblock_recv_bitmap_set(block, host);
2901             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2902         }
2903 
2904         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2905         case RAM_SAVE_FLAG_MEM_SIZE:
2906             /* Synchronize RAM block list */
2907             total_ram_bytes = addr;
2908             while (!ret && total_ram_bytes) {
2909                 RAMBlock *block;
2910                 char id[256];
2911                 ram_addr_t length;
2912 
2913                 len = qemu_get_byte(f);
2914                 qemu_get_buffer(f, (uint8_t *)id, len);
2915                 id[len] = 0;
2916                 length = qemu_get_be64(f);
2917 
2918                 block = qemu_ram_block_by_name(id);
2919                 if (block) {
2920                     if (length != block->used_length) {
2921                         Error *local_err = NULL;
2922 
2923                         ret = qemu_ram_resize(block, length,
2924                                               &local_err);
2925                         if (local_err) {
2926                             error_report_err(local_err);
2927                         }
2928                     }
2929                     /* For postcopy we need to check hugepage sizes match */
2930                     if (postcopy_advised &&
2931                         block->page_size != qemu_host_page_size) {
2932                         uint64_t remote_page_size = qemu_get_be64(f);
2933                         if (remote_page_size != block->page_size) {
2934                             error_report("Mismatched RAM page size %s "
2935                                          "(local) %zd != %" PRId64,
2936                                          id, block->page_size,
2937                                          remote_page_size);
2938                             ret = -EINVAL;
2939                         }
2940                     }
2941                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2942                                           block->idstr);
2943                 } else {
2944                     error_report("Unknown ramblock \"%s\", cannot "
2945                                  "accept migration", id);
2946                     ret = -EINVAL;
2947                 }
2948 
2949                 total_ram_bytes -= length;
2950             }
2951             break;
2952 
2953         case RAM_SAVE_FLAG_ZERO:
2954             ch = qemu_get_byte(f);
2955             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2956             break;
2957 
2958         case RAM_SAVE_FLAG_PAGE:
2959             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2960             break;
2961 
2962         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2963             len = qemu_get_be32(f);
2964             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2965                 error_report("Invalid compressed data length: %d", len);
2966                 ret = -EINVAL;
2967                 break;
2968             }
2969             decompress_data_with_multi_threads(f, host, len);
2970             break;
2971 
2972         case RAM_SAVE_FLAG_XBZRLE:
2973             if (load_xbzrle(f, addr, host) < 0) {
2974                 error_report("Failed to decompress XBZRLE page at "
2975                              RAM_ADDR_FMT, addr);
2976                 ret = -EINVAL;
2977                 break;
2978             }
2979             break;
2980         case RAM_SAVE_FLAG_EOS:
2981             /* normal exit */
2982             break;
2983         default:
2984             if (flags & RAM_SAVE_FLAG_HOOK) {
2985                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2986             } else {
2987                 error_report("Unknown combination of migration flags: %#x",
2988                              flags);
2989                 ret = -EINVAL;
2990             }
2991         }
2992         if (!ret) {
2993             ret = qemu_file_get_error(f);
2994         }
2995     }
2996 
2997     wait_for_decompress_done();
2998     rcu_read_unlock();
2999     trace_ram_load_complete(ret, seq_iter);
3000     return ret;
3001 }
3002 
3003 static bool ram_has_postcopy(void *opaque)
3004 {
3005     return migrate_postcopy_ram();
3006 }
3007 
3008 static SaveVMHandlers savevm_ram_handlers = {
3009     .save_setup = ram_save_setup,
3010     .save_live_iterate = ram_save_iterate,
3011     .save_live_complete_postcopy = ram_save_complete,
3012     .save_live_complete_precopy = ram_save_complete,
3013     .has_postcopy = ram_has_postcopy,
3014     .save_live_pending = ram_save_pending,
3015     .load_state = ram_load,
3016     .save_cleanup = ram_save_cleanup,
3017     .load_setup = ram_load_setup,
3018     .load_cleanup = ram_load_cleanup,
3019 };
3020 
3021 void ram_mig_init(void)
3022 {
3023     qemu_mutex_init(&XBZRLE.lock);
3024     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3025 }
3026