xref: /openbmc/qemu/migration/ram.c (revision 62aa1d88)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
57 #include "savevm.h"
58 #include "qemu/iov.h"
59 
60 /***********************************************************/
61 /* ram save/restore */
62 
63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
64  * worked for pages that were filled with the same char.  We switched
65  * it to only search for the zero value, and renamed it to avoid
66  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
67  */
68 
69 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
70 #define RAM_SAVE_FLAG_ZERO     0x02
71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
72 #define RAM_SAVE_FLAG_PAGE     0x08
73 #define RAM_SAVE_FLAG_EOS      0x10
74 #define RAM_SAVE_FLAG_CONTINUE 0x20
75 #define RAM_SAVE_FLAG_XBZRLE   0x40
76 /* 0x80 is reserved in migration.h; the next free flag value is 0x100 */
77 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
78 
79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 {
81     return buffer_is_zero(p, size);
82 }
83 
84 XBZRLECacheStats xbzrle_counters;
85 
86 /* This struct contains the XBZRLE cache and a static page
87    used by the compression */
88 static struct {
89     /* buffer used for XBZRLE encoding */
90     uint8_t *encoded_buf;
91     /* buffer for storing page content */
92     uint8_t *current_buf;
93     /* Cache for XBZRLE, Protected by lock. */
94     PageCache *cache;
95     QemuMutex lock;
96     /* it will store a page full of zeros */
97     uint8_t *zero_target_page;
98     /* buffer used for XBZRLE decoding */
99     uint8_t *decoded_buf;
100 } XBZRLE;
101 
102 static void XBZRLE_cache_lock(void)
103 {
104     if (migrate_use_xbzrle())
105         qemu_mutex_lock(&XBZRLE.lock);
106 }
107 
108 static void XBZRLE_cache_unlock(void)
109 {
110     if (migrate_use_xbzrle())
111         qemu_mutex_unlock(&XBZRLE.lock);
112 }
113 
114 /**
115  * xbzrle_cache_resize: resize the xbzrle cache
116  *
117  * This function is called from qmp_migrate_set_cache_size in the main
118  * thread, possibly while a migration is in progress.  A running
119  * migration may be using the cache and might finish during this call,
120  * hence changes to the cache are protected by XBZRLE.lock.
121  *
122  * Returns 0 for success or -1 for error
123  *
124  * @new_size: new cache size
125  * @errp: set to the reason on failure
126  */
127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 {
129     PageCache *new_cache;
130     int64_t ret = 0;
131 
132     /* Check for truncation */
133     if (new_size != (size_t)new_size) {
134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135                    "exceeding address space");
136         return -1;
137     }
138 
139     if (new_size == migrate_xbzrle_cache_size()) {
140         /* nothing to do */
141         return 0;
142     }
143 
144     XBZRLE_cache_lock();
145 
146     if (XBZRLE.cache != NULL) {
147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
148         if (!new_cache) {
149             ret = -1;
150             goto out;
151         }
152 
153         cache_fini(XBZRLE.cache);
154         XBZRLE.cache = new_cache;
155     }
156 out:
157     XBZRLE_cache_unlock();
158     return ret;
159 }
160 
161 /* Should be holding either ram_list.mutex, or the RCU lock. */
162 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
163     INTERNAL_RAMBLOCK_FOREACH(block)                   \
164         if (!qemu_ram_is_migratable(block)) {} else
165 
166 #undef RAMBLOCK_FOREACH
167 
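/*
 * ramblock_recv_map_init: allocate the destination-side "received" bitmap
 * for every migratable RAMBlock, one bit per target page of max_length.
 */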
168 static void ramblock_recv_map_init(void)
169 {
170     RAMBlock *rb;
171 
172     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
173         assert(!rb->receivedmap);
174         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
175     }
176 }
177 
178 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
179 {
180     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
181                     rb->receivedmap);
182 }
183 
184 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
185 {
186     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
187 }
188 
189 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
190 {
191     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
192 }
193 
194 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
195                                     size_t nr)
196 {
197     bitmap_set_atomic(rb->receivedmap,
198                       ramblock_recv_bitmap_offset(host_addr, rb),
199                       nr);
200 }
201 
202 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
203 
204 /*
205  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
206  *
207  * Returns the number of bytes sent (> 0) on success, or < 0 on error.
208  */
209 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
210                                   const char *block_name)
211 {
212     RAMBlock *block = qemu_ram_block_by_name(block_name);
213     unsigned long *le_bitmap, nbits;
214     uint64_t size;
215 
216     if (!block) {
217         error_report("%s: invalid block name: %s", __func__, block_name);
218         return -1;
219     }
220 
221     nbits = block->used_length >> TARGET_PAGE_BITS;
222 
223     /*
224      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
225      * machines we may need 4 more bytes for padding (see the comment
226      * below).  So extend it a bit beforehand.
227      */
228     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
229 
230     /*
231      * Always use little endian when sending the bitmap.  This is
232      * required when the source and destination VMs are not using the
233      * same endianness.  (Note: big endian won't work.)
234      */
235     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
236 
237     /* Size of the bitmap, in bytes */
238     size = DIV_ROUND_UP(nbits, 8);
239 
240     /*
241      * size is always aligned to 8 bytes for 64bit machines, but it
242      * may not be true for 32bit machines. We need this padding to
243      * make sure the migration can survive even between 32bit and
244      * 64bit machines.
245      */
246     size = ROUND_UP(size, 8);
247 
248     qemu_put_be64(file, size);
249     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
250     /*
251      * Mark the end, in case the middle part got corrupted for some
252      * "mysterious" reason.
253      */
254     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
255     qemu_fflush(file);
256 
257     g_free(le_bitmap);
258 
259     if (qemu_file_get_error(file)) {
260         return qemu_file_get_error(file);
261     }
262 
263     return size + sizeof(size);
264 }
265 
266 /*
267  * An outstanding page request, on the source, having been received
268  * and queued
269  */
270 struct RAMSrcPageRequest {
271     RAMBlock *rb;
272     hwaddr    offset;
273     hwaddr    len;
274 
275     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
276 };
277 
278 /* State of RAM for migration */
279 struct RAMState {
280     /* QEMUFile used for this migration */
281     QEMUFile *f;
282     /* Last block that we have visited searching for dirty pages */
283     RAMBlock *last_seen_block;
284     /* Last block from where we have sent data */
285     RAMBlock *last_sent_block;
286     /* Last dirty target page we have sent */
287     ram_addr_t last_page;
288     /* last ram version we have seen */
289     uint32_t last_version;
290     /* We are in the first round */
291     bool ram_bulk_stage;
292     /* How many times we have dirtied too many pages */
293     int dirty_rate_high_cnt;
294     /* these variables are used for bitmap sync */
295     /* last time we did a full bitmap_sync */
296     int64_t time_last_bitmap_sync;
297     /* bytes transferred at the start of the period */
298     uint64_t bytes_xfer_prev;
299     /* number of dirty pages during the current period */
300     uint64_t num_dirty_pages_period;
301     /* xbzrle misses since the beginning of the period */
302     uint64_t xbzrle_cache_miss_prev;
303     /* number of iterations at the beginning of period */
304     uint64_t iterations_prev;
305     /* Iterations since start */
306     uint64_t iterations;
307     /* number of dirty bits in the bitmap */
308     uint64_t migration_dirty_pages;
309     /* protects modification of the bitmap */
310     QemuMutex bitmap_mutex;
311     /* The RAMBlock used in the last src_page_requests */
312     RAMBlock *last_req_rb;
313     /* Queue of outstanding page requests from the destination */
314     QemuMutex src_page_req_mutex;
315     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
316 };
317 typedef struct RAMState RAMState;
318 
319 static RAMState *ram_state;
320 
321 uint64_t ram_bytes_remaining(void)
322 {
323     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
324                        0;
325 }
326 
327 MigrationStats ram_counters;
328 
329 /* used by the search for pages to send */
330 struct PageSearchStatus {
331     /* Current block being searched */
332     RAMBlock    *block;
333     /* Current page to search from */
334     unsigned long page;
335     /* Set once we wrap around */
336     bool         complete_round;
337 };
338 typedef struct PageSearchStatus PageSearchStatus;
339 
340 struct CompressParam {
341     bool done;
342     bool quit;
343     QEMUFile *file;
344     QemuMutex mutex;
345     QemuCond cond;
346     RAMBlock *block;
347     ram_addr_t offset;
348 
349     /* internally used fields */
350     z_stream stream;
351     uint8_t *originbuf;
352 };
353 typedef struct CompressParam CompressParam;
354 
355 struct DecompressParam {
356     bool done;
357     bool quit;
358     QemuMutex mutex;
359     QemuCond cond;
360     void *des;
361     uint8_t *compbuf;
362     int len;
363     z_stream stream;
364 };
365 typedef struct DecompressParam DecompressParam;
366 
367 static CompressParam *comp_param;
368 static QemuThread *compress_threads;
369 /* comp_done_cond is used to wake up the migration thread when
370  * one of the compression threads has finished the compression.
371  * comp_done_lock is used to co-work with comp_done_cond.
372  */
373 static QemuMutex comp_done_lock;
374 static QemuCond comp_done_cond;
375 /* The empty QEMUFileOps will be used by file in CompressParam */
376 static const QEMUFileOps empty_ops = { };
377 
378 static QEMUFile *decomp_file;
379 static DecompressParam *decomp_param;
380 static QemuThread *decompress_threads;
381 static QemuMutex decomp_done_lock;
382 static QemuCond decomp_done_cond;
383 
384 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
385                                 ram_addr_t offset, uint8_t *source_buf);
386 
387 static void *do_data_compress(void *opaque)
388 {
389     CompressParam *param = opaque;
390     RAMBlock *block;
391     ram_addr_t offset;
392 
393     qemu_mutex_lock(&param->mutex);
394     while (!param->quit) {
395         if (param->block) {
396             block = param->block;
397             offset = param->offset;
398             param->block = NULL;
399             qemu_mutex_unlock(&param->mutex);
400 
401             do_compress_ram_page(param->file, &param->stream, block, offset,
402                                  param->originbuf);
403 
404             qemu_mutex_lock(&comp_done_lock);
405             param->done = true;
406             qemu_cond_signal(&comp_done_cond);
407             qemu_mutex_unlock(&comp_done_lock);
408 
409             qemu_mutex_lock(&param->mutex);
410         } else {
411             qemu_cond_wait(&param->cond, &param->mutex);
412         }
413     }
414     qemu_mutex_unlock(&param->mutex);
415 
416     return NULL;
417 }
418 
419 static inline void terminate_compression_threads(void)
420 {
421     int idx, thread_count;
422 
423     thread_count = migrate_compress_threads();
424 
425     for (idx = 0; idx < thread_count; idx++) {
426         qemu_mutex_lock(&comp_param[idx].mutex);
427         comp_param[idx].quit = true;
428         qemu_cond_signal(&comp_param[idx].cond);
429         qemu_mutex_unlock(&comp_param[idx].mutex);
430     }
431 }
432 
433 static void compress_threads_save_cleanup(void)
434 {
435     int i, thread_count;
436 
437     if (!migrate_use_compression()) {
438         return;
439     }
440     terminate_compression_threads();
441     thread_count = migrate_compress_threads();
442     for (i = 0; i < thread_count; i++) {
443         /*
444          * we use it as an indicator which shows whether the thread is
445          * properly initialized or not
446          */
447         if (!comp_param[i].file) {
448             break;
449         }
450         qemu_thread_join(compress_threads + i);
451         qemu_mutex_destroy(&comp_param[i].mutex);
452         qemu_cond_destroy(&comp_param[i].cond);
453         deflateEnd(&comp_param[i].stream);
454         g_free(comp_param[i].originbuf);
455         qemu_fclose(comp_param[i].file);
456         comp_param[i].file = NULL;
457     }
458     qemu_mutex_destroy(&comp_done_lock);
459     qemu_cond_destroy(&comp_done_cond);
460     g_free(compress_threads);
461     g_free(comp_param);
462     compress_threads = NULL;
463     comp_param = NULL;
464 }
465 
466 static int compress_threads_save_setup(void)
467 {
468     int i, thread_count;
469 
470     if (!migrate_use_compression()) {
471         return 0;
472     }
473     thread_count = migrate_compress_threads();
474     compress_threads = g_new0(QemuThread, thread_count);
475     comp_param = g_new0(CompressParam, thread_count);
476     qemu_cond_init(&comp_done_cond);
477     qemu_mutex_init(&comp_done_lock);
478     for (i = 0; i < thread_count; i++) {
479         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
480         if (!comp_param[i].originbuf) {
481             goto exit;
482         }
483 
484         if (deflateInit(&comp_param[i].stream,
485                         migrate_compress_level()) != Z_OK) {
486             g_free(comp_param[i].originbuf);
487             goto exit;
488         }
489 
490         /* comp_param[i].file is just used as a dummy buffer to save data,
491          * set its ops to empty.
492          */
493         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
494         comp_param[i].done = true;
495         comp_param[i].quit = false;
496         qemu_mutex_init(&comp_param[i].mutex);
497         qemu_cond_init(&comp_param[i].cond);
498         qemu_thread_create(compress_threads + i, "compress",
499                            do_data_compress, comp_param + i,
500                            QEMU_THREAD_JOINABLE);
501     }
502     return 0;
503 
504 exit:
505     compress_threads_save_cleanup();
506     return -1;
507 }
508 
509 /* Multiple fd's */
510 
511 #define MULTIFD_MAGIC 0x11223344U
512 #define MULTIFD_VERSION 1
513 
514 #define MULTIFD_FLAG_SYNC (1 << 0)
515 
516 typedef struct {
517     uint32_t magic;
518     uint32_t version;
519     unsigned char uuid[16]; /* QemuUUID */
520     uint8_t id;
521 } __attribute__((packed)) MultiFDInit_t;
522 
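/*
 * On-wire layout of a multifd data packet.  All integer fields are sent in
 * big-endian byte order (see multifd_send_fill_packet() and
 * multifd_recv_unfill_packet()): "size" is the allocated page count of the
 * packet, "used" the number of page offsets it actually carries.
 */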
523 typedef struct {
524     uint32_t magic;
525     uint32_t version;
526     uint32_t flags;
527     uint32_t size;
528     uint32_t used;
529     uint64_t packet_num;
530     char ramblock[256];
531     uint64_t offset[];
532 } __attribute__((packed)) MultiFDPacket_t;
533 
534 typedef struct {
535     /* number of used pages */
536     uint32_t used;
537     /* number of allocated pages */
538     uint32_t allocated;
539     /* global number of generated multifd packets */
540     uint64_t packet_num;
541     /* offset of each page */
542     ram_addr_t *offset;
543     /* pointer to each page */
544     struct iovec *iov;
545     RAMBlock *block;
546 } MultiFDPages_t;
547 
548 typedef struct {
549     /* these fields are not changed once the thread is created */
550     /* channel number */
551     uint8_t id;
552     /* channel thread name */
553     char *name;
554     /* channel thread id */
555     QemuThread thread;
556     /* communication channel */
557     QIOChannel *c;
558     /* sem where to wait for more work */
559     QemuSemaphore sem;
560     /* this mutex protects the following parameters */
561     QemuMutex mutex;
562     /* is this channel thread running */
563     bool running;
564     /* should this thread finish */
565     bool quit;
566     /* thread has work to do */
567     int pending_job;
568     /* array of pages to send */
569     MultiFDPages_t *pages;
570     /* packet allocated len */
571     uint32_t packet_len;
572     /* pointer to the packet */
573     MultiFDPacket_t *packet;
574     /* multifd flags for each packet */
575     uint32_t flags;
576     /* global number of generated multifd packets */
577     uint64_t packet_num;
578     /* thread local variables */
579     /* packets sent through this channel */
580     uint64_t num_packets;
581     /* pages sent through this channel */
582     uint64_t num_pages;
583     /* syncs main thread and channels */
584     QemuSemaphore sem_sync;
585 }  MultiFDSendParams;
586 
587 typedef struct {
588     /* these fields are not changed once the thread is created */
589     /* channel number */
590     uint8_t id;
591     /* channel thread name */
592     char *name;
593     /* channel thread id */
594     QemuThread thread;
595     /* communication channel */
596     QIOChannel *c;
597     /* this mutex protects the following parameters */
598     QemuMutex mutex;
599     /* is this channel thread running */
600     bool running;
601     /* array of pages to receive */
602     MultiFDPages_t *pages;
603     /* packet allocated len */
604     uint32_t packet_len;
605     /* pointer to the packet */
606     MultiFDPacket_t *packet;
607     /* multifd flags for each packet */
608     uint32_t flags;
609     /* global number of generated multifd packets */
610     uint64_t packet_num;
611     /* thread local variables */
612     /* packets received through this channel */
613     uint64_t num_packets;
614     /* pages received through this channel */
615     uint64_t num_pages;
616     /* syncs main thread and channels */
617     QemuSemaphore sem_sync;
618 } MultiFDRecvParams;
619 
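/*
 * multifd_send_initial_packet: send the per-channel handshake (magic,
 * version, QEMU UUID and channel id) so the destination can validate and
 * route this channel.
 *
 * Returns 0 on success, -1 on error (with @errp set).
 */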
620 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
621 {
622     MultiFDInit_t msg;
623     int ret;
624 
625     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
626     msg.version = cpu_to_be32(MULTIFD_VERSION);
627     msg.id = p->id;
628     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
629 
630     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
631     if (ret != 0) {
632         return -1;
633     }
634     return 0;
635 }
636 
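/*
 * multifd_recv_initial_packet: read and validate the handshake sent by
 * multifd_send_initial_packet().
 *
 * Returns the channel id on success, -1 on error (with @errp set).
 */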
637 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
638 {
639     MultiFDInit_t msg;
640     int ret;
641 
642     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
643     if (ret != 0) {
644         return -1;
645     }
646 
647     be32_to_cpus(&msg.magic);
648     be32_to_cpus(&msg.version);
649 
650     if (msg.magic != MULTIFD_MAGIC) {
651         error_setg(errp, "multifd: received packet magic %x "
652                    "expected %x", msg.magic, MULTIFD_MAGIC);
653         return -1;
654     }
655 
656     if (msg.version != MULTIFD_VERSION) {
657         error_setg(errp, "multifd: received packet version %d "
658                    "expected %d", msg.version, MULTIFD_VERSION);
659         return -1;
660     }
661 
662     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
663         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
664         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
665 
666         error_setg(errp, "multifd: received uuid '%s' and expected "
667                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
668         g_free(uuid);
669         g_free(msg_uuid);
670         return -1;
671     }
672 
673     if (msg.id > migrate_multifd_channels()) {
674         error_setg(errp, "multifd: received channel id %d, expected "
675                    "at most %d", msg.id, migrate_multifd_channels());
676         return -1;
677     }
678 
679     return msg.id;
680 }
681 
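/*
 * multifd_pages_init: allocate a MultiFDPages_t able to hold @size pages
 * (one offset plus one iovec per page).
 */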
682 static MultiFDPages_t *multifd_pages_init(size_t size)
683 {
684     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
685 
686     pages->allocated = size;
687     pages->iov = g_new0(struct iovec, size);
688     pages->offset = g_new0(ram_addr_t, size);
689 
690     return pages;
691 }
692 
693 static void multifd_pages_clear(MultiFDPages_t *pages)
694 {
695     pages->used = 0;
696     pages->allocated = 0;
697     pages->packet_num = 0;
698     pages->block = NULL;
699     g_free(pages->iov);
700     pages->iov = NULL;
701     g_free(pages->offset);
702     pages->offset = NULL;
703     g_free(pages);
704 }
705 
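/*
 * multifd_send_fill_packet: fill p->packet from p->pages, converting the
 * header fields to big-endian wire format.
 */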
706 static void multifd_send_fill_packet(MultiFDSendParams *p)
707 {
708     MultiFDPacket_t *packet = p->packet;
709     int i;
710 
711     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
712     packet->version = cpu_to_be32(MULTIFD_VERSION);
713     packet->flags = cpu_to_be32(p->flags);
714     packet->size = cpu_to_be32(migrate_multifd_page_count());
715     packet->used = cpu_to_be32(p->pages->used);
716     packet->packet_num = cpu_to_be64(p->packet_num);
717 
718     if (p->pages->block) {
719         strncpy(packet->ramblock, p->pages->block->idstr, 256);
720     }
721 
722     for (i = 0; i < p->pages->used; i++) {
723         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
724     }
725 }
726 
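/*
 * multifd_recv_unfill_packet: validate the header in p->packet (magic,
 * version, sizes and RAMBlock name) and build the iovec array pointing at
 * the host pages that will receive the data.
 *
 * Returns 0 on success, -1 on error (with @errp set).
 */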
727 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
728 {
729     MultiFDPacket_t *packet = p->packet;
730     RAMBlock *block;
731     int i;
732 
733     be32_to_cpus(&packet->magic);
734     if (packet->magic != MULTIFD_MAGIC) {
735         error_setg(errp, "multifd: received packet "
736                    "magic %x and expected magic %x",
737                    packet->magic, MULTIFD_MAGIC);
738         return -1;
739     }
740 
741     be32_to_cpus(&packet->version);
742     if (packet->version != MULTIFD_VERSION) {
743         error_setg(errp, "multifd: received packet "
744                    "version %d and expected version %d",
745                    packet->version, MULTIFD_VERSION);
746         return -1;
747     }
748 
749     p->flags = be32_to_cpu(packet->flags);
750 
751     be32_to_cpus(&packet->size);
752     if (packet->size > migrate_multifd_page_count()) {
753         error_setg(errp, "multifd: received packet "
754                    "with size %d and expected maximum size %d",
755                    packet->size, migrate_multifd_page_count());
756         return -1;
757     }
758 
759     p->pages->used = be32_to_cpu(packet->used);
760     if (p->pages->used > packet->size) {
761         error_setg(errp, "multifd: received packet "
762                    "with %d pages and expected maximum pages %d",
763                    p->pages->used, packet->size);
764         return -1;
765     }
766 
767     p->packet_num = be64_to_cpu(packet->packet_num);
768 
769     if (p->pages->used) {
770         /* make sure that ramblock is 0 terminated */
771         packet->ramblock[255] = 0;
772         block = qemu_ram_block_by_name(packet->ramblock);
773         if (!block) {
774             error_setg(errp, "multifd: unknown ram block %s",
775                        packet->ramblock);
776             return -1;
777         }
778     }
779 
780     for (i = 0; i < p->pages->used; i++) {
781         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
782 
783         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
784             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
785                        " (max " RAM_ADDR_FMT ")",
786                        offset, block->used_length);
787             return -1;
788         }
789         p->pages->iov[i].iov_base = block->host + offset;
790         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
791     }
792 
793     return 0;
794 }
795 
796 struct {
797     MultiFDSendParams *params;
798     /* number of created threads */
799     int count;
800     /* array of pages to send */
801     MultiFDPages_t *pages;
802     /* syncs main thread and channels */
803     QemuSemaphore sem_sync;
804     /* global number of generated multifd packets */
805     uint64_t packet_num;
806     /* send channels ready */
807     QemuSemaphore channels_ready;
808 } *multifd_send_state;
809 
810 /*
811  * How do we use multifd_send_state->pages and channel->pages?
812  *
813  * We create a pages structure for each channel, plus a main one.  Each
814  * time we need to send a batch of pages, we swap the one in
815  * multifd_send_state with the one in the channel that is sending it.
816  * There are two reasons for that:
817  *    - to avoid doing so many mallocs during migration
818  *    - to make it easier to know what to free at the end of migration
819  *
820  * This way we always know who is the owner of each "pages" struct,
821  * and we don't need any locking.  It belongs either to the migration
822  * thread or to the channel thread.  Switching is safe because the
823  * migration thread holds the channel mutex while changing it, and the
824  * channel thread has to have finished with its own one, otherwise
825  * pending_job can't be false.
826  */
827 
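/*
 * multifd_send_pages: hand the current batch of pages to an idle channel.
 * Swaps multifd_send_state->pages with the channel's pages, accounts the
 * transferred bytes and wakes the channel thread up.
 */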
828 static void multifd_send_pages(void)
829 {
830     int i;
831     static int next_channel;
832     MultiFDSendParams *p = NULL; /* make gcc happy */
833     MultiFDPages_t *pages = multifd_send_state->pages;
834     uint64_t transferred;
835 
836     qemu_sem_wait(&multifd_send_state->channels_ready);
837     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
838         p = &multifd_send_state->params[i];
839 
840         qemu_mutex_lock(&p->mutex);
841         if (!p->pending_job) {
842             p->pending_job++;
843             next_channel = (i + 1) % migrate_multifd_channels();
844             break;
845         }
846         qemu_mutex_unlock(&p->mutex);
847     }
848     p->pages->used = 0;
849 
850     p->packet_num = multifd_send_state->packet_num++;
851     p->pages->block = NULL;
852     multifd_send_state->pages = p->pages;
853     p->pages = pages;
854     transferred = pages->used * TARGET_PAGE_SIZE + p->packet_len;
855     ram_counters.multifd_bytes += transferred;
856     ram_counters.transferred += transferred;
857     qemu_mutex_unlock(&p->mutex);
858     qemu_sem_post(&p->sem);
859 }
860 
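/*
 * multifd_queue_page: queue one page for multifd transmission.  The batch
 * is flushed with multifd_send_pages() when it is full or when the page
 * belongs to a different RAMBlock than the pages already queued.
 */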
861 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
862 {
863     MultiFDPages_t *pages = multifd_send_state->pages;
864 
865     if (!pages->block) {
866         pages->block = block;
867     }
868 
869     if (pages->block == block) {
870         pages->offset[pages->used] = offset;
871         pages->iov[pages->used].iov_base = block->host + offset;
872         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
873         pages->used++;
874 
875         if (pages->used < pages->allocated) {
876             return;
877         }
878     }
879 
880     multifd_send_pages();
881 
882     if (pages->block != block) {
883         multifd_queue_page(block, offset);
884     }
885 }
886 
887 static void multifd_send_terminate_threads(Error *err)
888 {
889     int i;
890 
891     if (err) {
892         MigrationState *s = migrate_get_current();
893         migrate_set_error(s, err);
894         if (s->state == MIGRATION_STATUS_SETUP ||
895             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
896             s->state == MIGRATION_STATUS_DEVICE ||
897             s->state == MIGRATION_STATUS_ACTIVE) {
898             migrate_set_state(&s->state, s->state,
899                               MIGRATION_STATUS_FAILED);
900         }
901     }
902 
903     for (i = 0; i < migrate_multifd_channels(); i++) {
904         MultiFDSendParams *p = &multifd_send_state->params[i];
905 
906         qemu_mutex_lock(&p->mutex);
907         p->quit = true;
908         qemu_sem_post(&p->sem);
909         qemu_mutex_unlock(&p->mutex);
910     }
911 }
912 
913 int multifd_save_cleanup(Error **errp)
914 {
915     int i;
916     int ret = 0;
917 
918     if (!migrate_use_multifd()) {
919         return 0;
920     }
921     multifd_send_terminate_threads(NULL);
922     for (i = 0; i < migrate_multifd_channels(); i++) {
923         MultiFDSendParams *p = &multifd_send_state->params[i];
924 
925         if (p->running) {
926             qemu_thread_join(&p->thread);
927         }
928         socket_send_channel_destroy(p->c);
929         p->c = NULL;
930         qemu_mutex_destroy(&p->mutex);
931         qemu_sem_destroy(&p->sem);
932         qemu_sem_destroy(&p->sem_sync);
933         g_free(p->name);
934         p->name = NULL;
935         multifd_pages_clear(p->pages);
936         p->pages = NULL;
937         p->packet_len = 0;
938         g_free(p->packet);
939         p->packet = NULL;
940     }
941     qemu_sem_destroy(&multifd_send_state->channels_ready);
942     qemu_sem_destroy(&multifd_send_state->sem_sync);
943     g_free(multifd_send_state->params);
944     multifd_send_state->params = NULL;
945     multifd_pages_clear(multifd_send_state->pages);
946     multifd_send_state->pages = NULL;
947     g_free(multifd_send_state);
948     multifd_send_state = NULL;
949     return ret;
950 }
951 
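/*
 * multifd_send_sync_main: flush any pending pages, make every send channel
 * emit a packet with MULTIFD_FLAG_SYNC set and wait until all channels
 * have done so.
 */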
952 static void multifd_send_sync_main(void)
953 {
954     int i;
955 
956     if (!migrate_use_multifd()) {
957         return;
958     }
959     if (multifd_send_state->pages->used) {
960         multifd_send_pages();
961     }
962     for (i = 0; i < migrate_multifd_channels(); i++) {
963         MultiFDSendParams *p = &multifd_send_state->params[i];
964 
965         trace_multifd_send_sync_main_signal(p->id);
966 
967         qemu_mutex_lock(&p->mutex);
968 
969         p->packet_num = multifd_send_state->packet_num++;
970         p->flags |= MULTIFD_FLAG_SYNC;
971         p->pending_job++;
972         qemu_mutex_unlock(&p->mutex);
973         qemu_sem_post(&p->sem);
974     }
975     for (i = 0; i < migrate_multifd_channels(); i++) {
976         MultiFDSendParams *p = &multifd_send_state->params[i];
977 
978         trace_multifd_send_sync_main_wait(p->id);
979         qemu_sem_wait(&multifd_send_state->sem_sync);
980     }
981     trace_multifd_send_sync_main(multifd_send_state->packet_num);
982 }
983 
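/*
 * multifd_send_thread: per-channel send thread.  Sends the initial
 * handshake and then, each time it is kicked, writes one packet header
 * plus the queued pages to its channel, until asked to quit.
 */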
984 static void *multifd_send_thread(void *opaque)
985 {
986     MultiFDSendParams *p = opaque;
987     Error *local_err = NULL;
988     int ret;
989 
990     trace_multifd_send_thread_start(p->id);
991 
992     if (multifd_send_initial_packet(p, &local_err) < 0) {
993         goto out;
994     }
995     /* initial packet */
996     p->num_packets = 1;
997 
998     while (true) {
999         qemu_sem_wait(&p->sem);
1000         qemu_mutex_lock(&p->mutex);
1001 
1002         if (p->pending_job) {
1003             uint32_t used = p->pages->used;
1004             uint64_t packet_num = p->packet_num;
1005             uint32_t flags = p->flags;
1006 
1007             multifd_send_fill_packet(p);
1008             p->flags = 0;
1009             p->num_packets++;
1010             p->num_pages += used;
1011             p->pages->used = 0;
1012             qemu_mutex_unlock(&p->mutex);
1013 
1014             trace_multifd_send(p->id, packet_num, used, flags);
1015 
1016             ret = qio_channel_write_all(p->c, (void *)p->packet,
1017                                         p->packet_len, &local_err);
1018             if (ret != 0) {
1019                 break;
1020             }
1021 
1022             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026 
1027             qemu_mutex_lock(&p->mutex);
1028             p->pending_job--;
1029             qemu_mutex_unlock(&p->mutex);
1030 
1031             if (flags & MULTIFD_FLAG_SYNC) {
1032                 qemu_sem_post(&multifd_send_state->sem_sync);
1033             }
1034             qemu_sem_post(&multifd_send_state->channels_ready);
1035         } else if (p->quit) {
1036             qemu_mutex_unlock(&p->mutex);
1037             break;
1038         } else {
1039             qemu_mutex_unlock(&p->mutex);
1040             /* sometimes there are spurious wakeups */
1041         }
1042     }
1043 
1044 out:
1045     if (local_err) {
1046         multifd_send_terminate_threads(local_err);
1047     }
1048 
1049     qemu_mutex_lock(&p->mutex);
1050     p->running = false;
1051     qemu_mutex_unlock(&p->mutex);
1052 
1053     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1054 
1055     return NULL;
1056 }
1057 
1058 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1059 {
1060     MultiFDSendParams *p = opaque;
1061     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1062     Error *local_err = NULL;
1063 
1064     if (qio_task_propagate_error(task, &local_err)) {
1065         if (multifd_save_cleanup(&local_err) != 0) {
1066             migrate_set_error(migrate_get_current(), local_err);
1067         }
1068     } else {
1069         p->c = QIO_CHANNEL(sioc);
1070         qio_channel_set_delay(p->c, false);
1071         p->running = true;
1072         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1073                            QEMU_THREAD_JOINABLE);
1074 
1075         atomic_inc(&multifd_send_state->count);
1076     }
1077 }
1078 
1079 int multifd_save_setup(void)
1080 {
1081     int thread_count;
1082     uint32_t page_count = migrate_multifd_page_count();
1083     uint8_t i;
1084 
1085     if (!migrate_use_multifd()) {
1086         return 0;
1087     }
1088     thread_count = migrate_multifd_channels();
1089     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1090     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1091     atomic_set(&multifd_send_state->count, 0);
1092     multifd_send_state->pages = multifd_pages_init(page_count);
1093     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1094     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1095 
1096     for (i = 0; i < thread_count; i++) {
1097         MultiFDSendParams *p = &multifd_send_state->params[i];
1098 
1099         qemu_mutex_init(&p->mutex);
1100         qemu_sem_init(&p->sem, 0);
1101         qemu_sem_init(&p->sem_sync, 0);
1102         p->quit = false;
1103         p->pending_job = 0;
1104         p->id = i;
1105         p->pages = multifd_pages_init(page_count);
1106         p->packet_len = sizeof(MultiFDPacket_t)
1107                       + sizeof(ram_addr_t) * page_count;
1108         p->packet = g_malloc0(p->packet_len);
1109         p->name = g_strdup_printf("multifdsend_%d", i);
1110         socket_send_channel_create(multifd_new_send_channel_async, p);
1111     }
1112     return 0;
1113 }
1114 
1115 struct {
1116     MultiFDRecvParams *params;
1117     /* number of created threads */
1118     int count;
1119     /* syncs main thread and channels */
1120     QemuSemaphore sem_sync;
1121     /* global number of generated multifd packets */
1122     uint64_t packet_num;
1123 } *multifd_recv_state;
1124 
1125 static void multifd_recv_terminate_threads(Error *err)
1126 {
1127     int i;
1128 
1129     if (err) {
1130         MigrationState *s = migrate_get_current();
1131         migrate_set_error(s, err);
1132         if (s->state == MIGRATION_STATUS_SETUP ||
1133             s->state == MIGRATION_STATUS_ACTIVE) {
1134             migrate_set_state(&s->state, s->state,
1135                               MIGRATION_STATUS_FAILED);
1136         }
1137     }
1138 
1139     for (i = 0; i < migrate_multifd_channels(); i++) {
1140         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1141 
1142         qemu_mutex_lock(&p->mutex);
1143         /* We could arrive here for two reasons:
1144            - normal quit, i.e. everything went fine, just finished
1145            - error quit: we close the channels so that the channel threads
1146              get out of qio_channel_read_all_eof() */
1147         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1148         qemu_mutex_unlock(&p->mutex);
1149     }
1150 }
1151 
1152 int multifd_load_cleanup(Error **errp)
1153 {
1154     int i;
1155     int ret = 0;
1156 
1157     if (!migrate_use_multifd()) {
1158         return 0;
1159     }
1160     multifd_recv_terminate_threads(NULL);
1161     for (i = 0; i < migrate_multifd_channels(); i++) {
1162         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1163 
1164         if (p->running) {
1165             qemu_thread_join(&p->thread);
1166         }
1167         object_unref(OBJECT(p->c));
1168         p->c = NULL;
1169         qemu_mutex_destroy(&p->mutex);
1170         qemu_sem_destroy(&p->sem_sync);
1171         g_free(p->name);
1172         p->name = NULL;
1173         multifd_pages_clear(p->pages);
1174         p->pages = NULL;
1175         p->packet_len = 0;
1176         g_free(p->packet);
1177         p->packet = NULL;
1178     }
1179     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1180     g_free(multifd_recv_state->params);
1181     multifd_recv_state->params = NULL;
1182     g_free(multifd_recv_state);
1183     multifd_recv_state = NULL;
1184 
1185     return ret;
1186 }
1187 
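/*
 * multifd_recv_sync_main: wait until every receive channel has reached a
 * MULTIFD_FLAG_SYNC packet, record the highest packet number seen, then
 * release the channels again.
 */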
1188 static void multifd_recv_sync_main(void)
1189 {
1190     int i;
1191 
1192     if (!migrate_use_multifd()) {
1193         return;
1194     }
1195     for (i = 0; i < migrate_multifd_channels(); i++) {
1196         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197 
1198         trace_multifd_recv_sync_main_wait(p->id);
1199         qemu_sem_wait(&multifd_recv_state->sem_sync);
1200         qemu_mutex_lock(&p->mutex);
1201         if (multifd_recv_state->packet_num < p->packet_num) {
1202             multifd_recv_state->packet_num = p->packet_num;
1203         }
1204         qemu_mutex_unlock(&p->mutex);
1205     }
1206     for (i = 0; i < migrate_multifd_channels(); i++) {
1207         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1208 
1209         trace_multifd_recv_sync_main_signal(p->id);
1210         qemu_sem_post(&p->sem_sync);
1211     }
1212     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1213 }
1214 
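/*
 * multifd_recv_thread: per-channel receive thread.  Reads packet headers
 * and page data from its channel until EOF or error, synchronizing with
 * the main thread on MULTIFD_FLAG_SYNC packets.
 */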
1215 static void *multifd_recv_thread(void *opaque)
1216 {
1217     MultiFDRecvParams *p = opaque;
1218     Error *local_err = NULL;
1219     int ret;
1220 
1221     trace_multifd_recv_thread_start(p->id);
1222 
1223     while (true) {
1224         uint32_t used;
1225         uint32_t flags;
1226 
1227         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1228                                        p->packet_len, &local_err);
1229         if (ret == 0) {   /* EOF */
1230             break;
1231         }
1232         if (ret == -1) {   /* Error */
1233             break;
1234         }
1235 
1236         qemu_mutex_lock(&p->mutex);
1237         ret = multifd_recv_unfill_packet(p, &local_err);
1238         if (ret) {
1239             qemu_mutex_unlock(&p->mutex);
1240             break;
1241         }
1242 
1243         used = p->pages->used;
1244         flags = p->flags;
1245         trace_multifd_recv(p->id, p->packet_num, used, flags);
1246         p->num_packets++;
1247         p->num_pages += used;
1248         qemu_mutex_unlock(&p->mutex);
1249 
1250         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1251         if (ret != 0) {
1252             break;
1253         }
1254 
1255         if (flags & MULTIFD_FLAG_SYNC) {
1256             qemu_sem_post(&multifd_recv_state->sem_sync);
1257             qemu_sem_wait(&p->sem_sync);
1258         }
1259     }
1260 
1261     if (local_err) {
1262         multifd_recv_terminate_threads(local_err);
1263     }
1264     qemu_mutex_lock(&p->mutex);
1265     p->running = false;
1266     qemu_mutex_unlock(&p->mutex);
1267 
1268     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1269 
1270     return NULL;
1271 }
1272 
1273 int multifd_load_setup(void)
1274 {
1275     int thread_count;
1276     uint32_t page_count = migrate_multifd_page_count();
1277     uint8_t i;
1278 
1279     if (!migrate_use_multifd()) {
1280         return 0;
1281     }
1282     thread_count = migrate_multifd_channels();
1283     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1284     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1285     atomic_set(&multifd_recv_state->count, 0);
1286     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1287 
1288     for (i = 0; i < thread_count; i++) {
1289         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1290 
1291         qemu_mutex_init(&p->mutex);
1292         qemu_sem_init(&p->sem_sync, 0);
1293         p->id = i;
1294         p->pages = multifd_pages_init(page_count);
1295         p->packet_len = sizeof(MultiFDPacket_t)
1296                       + sizeof(ram_addr_t) * page_count;
1297         p->packet = g_malloc0(p->packet_len);
1298         p->name = g_strdup_printf("multifdrecv_%d", i);
1299     }
1300     return 0;
1301 }
1302 
1303 bool multifd_recv_all_channels_created(void)
1304 {
1305     int thread_count = migrate_multifd_channels();
1306 
1307     if (!migrate_use_multifd()) {
1308         return true;
1309     }
1310 
1311     return thread_count == atomic_read(&multifd_recv_state->count);
1312 }
1313 
1314 /* Return true if multifd is ready for the migration, otherwise false */
1315 bool multifd_recv_new_channel(QIOChannel *ioc)
1316 {
1317     MultiFDRecvParams *p;
1318     Error *local_err = NULL;
1319     int id;
1320 
1321     id = multifd_recv_initial_packet(ioc, &local_err);
1322     if (id < 0) {
1323         multifd_recv_terminate_threads(local_err);
1324         return false;
1325     }
1326 
1327     p = &multifd_recv_state->params[id];
1328     if (p->c != NULL) {
1329         error_setg(&local_err, "multifd: received id '%d' already setup",
1330                    id);
1331         multifd_recv_terminate_threads(local_err);
1332         return false;
1333     }
1334     p->c = ioc;
1335     object_ref(OBJECT(ioc));
1336     /* initial packet */
1337     p->num_packets = 1;
1338 
1339     p->running = true;
1340     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1341                        QEMU_THREAD_JOINABLE);
1342     atomic_inc(&multifd_recv_state->count);
1343     return multifd_recv_state->count == migrate_multifd_channels();
1344 }
1345 
1346 /**
1347  * save_page_header: write page header to wire
1348  *
1349  * If this is the 1st block, it also writes the block identification
1350  *
1351  * Returns the number of bytes written
1352  *
1353  * @f: QEMUFile where to send the data
1354  * @block: block that contains the page we want to send
1355  * @offset: offset inside the block for the page
1356  *          in the lower bits, it contains flags
1357  */
1358 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1359                                ram_addr_t offset)
1360 {
1361     size_t size, len;
1362 
1363     if (block == rs->last_sent_block) {
1364         offset |= RAM_SAVE_FLAG_CONTINUE;
1365     }
1366     qemu_put_be64(f, offset);
1367     size = 8;
1368 
1369     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1370         len = strlen(block->idstr);
1371         qemu_put_byte(f, len);
1372         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1373         size += 1 + len;
1374         rs->last_sent_block = block;
1375     }
1376     return size;
1377 }
1378 
1379 /**
1380  * mig_throttle_guest_down: throttle down the guest
1381  *
1382  * Reduce amount of guest cpu execution to hopefully slow down memory
1383  * writes. If guest dirty memory rate is reduced below the rate at
1384  * which we can transfer pages to the destination then we should be
1385  * able to complete migration. Some workloads dirty memory way too
1386  * fast and will not effectively converge, even with auto-converge.
1387  */
1388 static void mig_throttle_guest_down(void)
1389 {
1390     MigrationState *s = migrate_get_current();
1391     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1392     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1393 
1394     /* We have not started throttling yet. Let's start it. */
1395     if (!cpu_throttle_active()) {
1396         cpu_throttle_set(pct_initial);
1397     } else {
1398         /* Throttling already on, just increase the rate */
1399         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
1400     }
1401 }
1402 
1403 /**
1404  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1405  *
1406  * @rs: current RAM state
1407  * @current_addr: address for the zero page
1408  *
1409  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1410  * The important thing is that a stale (not-yet-0'd) page be replaced
1411  * by the new data.
1412  * As a bonus, if the page wasn't in the cache it gets added so that
1413  * when a small write is made into the 0'd page it gets XBZRLE sent.
1414  */
1415 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1416 {
1417     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1418         return;
1419     }
1420 
1421     /* We don't care if this fails to allocate a new cache page
1422      * as long as it updates an old one */
1423     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1424                  ram_counters.dirty_sync_count);
1425 }
1426 
1427 #define ENCODING_FLAG_XBZRLE 0x1
1428 
1429 /**
1430  * save_xbzrle_page: compress and send current page
1431  *
1432  * Returns: 1 means that we wrote the page
1433  *          0 means that page is identical to the one already sent
1434  *          0 means that the page is identical to the one already sent
1435  *
1436  * @rs: current RAM state
1437  * @current_data: pointer to the address of the page contents
1438  * @current_addr: addr of the page
1439  * @block: block that contains the page we want to send
1440  * @offset: offset inside the block for the page
1441  * @last_stage: if we are at the completion stage
1442  */
1443 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1444                             ram_addr_t current_addr, RAMBlock *block,
1445                             ram_addr_t offset, bool last_stage)
1446 {
1447     int encoded_len = 0, bytes_xbzrle;
1448     uint8_t *prev_cached_page;
1449 
1450     if (!cache_is_cached(XBZRLE.cache, current_addr,
1451                          ram_counters.dirty_sync_count)) {
1452         xbzrle_counters.cache_miss++;
1453         if (!last_stage) {
1454             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1455                              ram_counters.dirty_sync_count) == -1) {
1456                 return -1;
1457             } else {
1458                 /* update *current_data when the page has been
1459                    inserted into cache */
1460                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1461             }
1462         }
1463         return -1;
1464     }
1465 
1466     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1467 
1468     /* save current buffer into memory */
1469     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1470 
1471     /* XBZRLE encoding (if there is no overflow) */
1472     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1473                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1474                                        TARGET_PAGE_SIZE);
1475     if (encoded_len == 0) {
1476         trace_save_xbzrle_page_skipping();
1477         return 0;
1478     } else if (encoded_len == -1) {
1479         trace_save_xbzrle_page_overflow();
1480         xbzrle_counters.overflow++;
1481         /* update data in the cache */
1482         if (!last_stage) {
1483             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1484             *current_data = prev_cached_page;
1485         }
1486         return -1;
1487     }
1488 
1489     /* we need to update the data in the cache, in order to get the same data */
1490     if (!last_stage) {
1491         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1492     }
1493 
1494     /* Send XBZRLE based compressed page */
1495     bytes_xbzrle = save_page_header(rs, rs->f, block,
1496                                     offset | RAM_SAVE_FLAG_XBZRLE);
1497     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1498     qemu_put_be16(rs->f, encoded_len);
1499     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1500     bytes_xbzrle += encoded_len + 1 + 2;
1501     xbzrle_counters.pages++;
1502     xbzrle_counters.bytes += bytes_xbzrle;
1503     ram_counters.transferred += bytes_xbzrle;
1504 
1505     return 1;
1506 }
1507 
1508 /**
1509  * migration_bitmap_find_dirty: find the next dirty page from start
1510  *
1511  * Called with rcu_read_lock() to protect migration_bitmap
1512  *
1513  * Returns the byte offset within memory region of the start of a dirty page
1514  * Returns the page number, within the RAMBlock, of the next dirty page
1515  * @rs: current RAM state
1516  * @rb: RAMBlock where to search for dirty pages
1517  * @start: page where we start the search
1518  */
1519 static inline
1520 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1521                                           unsigned long start)
1522 {
1523     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1524     unsigned long *bitmap = rb->bmap;
1525     unsigned long next;
1526 
1527     if (!qemu_ram_is_migratable(rb)) {
1528         return size;
1529     }
1530 
1531     if (rs->ram_bulk_stage && start > 0) {
1532         next = start + 1;
1533     } else {
1534         next = find_next_bit(bitmap, size, start);
1535     }
1536 
1537     return next;
1538 }
1539 
1540 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1541                                                 RAMBlock *rb,
1542                                                 unsigned long page)
1543 {
1544     bool ret;
1545 
1546     ret = test_and_clear_bit(page, rb->bmap);
1547 
1548     if (ret) {
1549         rs->migration_dirty_pages--;
1550     }
1551     return ret;
1552 }
1553 
1554 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1555                                         ram_addr_t start, ram_addr_t length)
1556 {
1557     rs->migration_dirty_pages +=
1558         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1559                                               &rs->num_dirty_pages_period);
1560 }
1561 
1562 /**
1563  * ram_pagesize_summary: calculate all the pagesizes of a VM
1564  *
1565  * Returns a summary bitmap of the page sizes of all RAMBlocks
1566  *
1567  * For VMs with just normal pages this is equivalent to the host page
1568  * size. If it's got some huge pages then it's the OR of all the
1569  * different page sizes.
1570  */
1571 uint64_t ram_pagesize_summary(void)
1572 {
1573     RAMBlock *block;
1574     uint64_t summary = 0;
1575 
1576     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1577         summary |= block->page_size;
1578     }
1579 
1580     return summary;
1581 }
1582 
1583 static void migration_update_rates(RAMState *rs, int64_t end_time)
1584 {
1585     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1586 
1587     /* calculate period counters */
1588     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1589                 / (end_time - rs->time_last_bitmap_sync);
1590 
1591     if (!iter_count) {
1592         return;
1593     }
1594 
1595     if (migrate_use_xbzrle()) {
1596         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1597             rs->xbzrle_cache_miss_prev) / iter_count;
1598         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1599     }
1600 }
1601 
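/*
 * migration_bitmap_sync: synchronize the per-block dirty bitmaps with the
 * global dirty log, update the period counters and, if auto-converge is
 * enabled and the guest dirties memory too fast, start or increase CPU
 * throttling.
 */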
1602 static void migration_bitmap_sync(RAMState *rs)
1603 {
1604     RAMBlock *block;
1605     int64_t end_time;
1606     uint64_t bytes_xfer_now;
1607 
1608     ram_counters.dirty_sync_count++;
1609 
1610     if (!rs->time_last_bitmap_sync) {
1611         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1612     }
1613 
1614     trace_migration_bitmap_sync_start();
1615     memory_global_dirty_log_sync();
1616 
1617     qemu_mutex_lock(&rs->bitmap_mutex);
1618     rcu_read_lock();
1619     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1620         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1621     }
1622     ram_counters.remaining = ram_bytes_remaining();
1623     rcu_read_unlock();
1624     qemu_mutex_unlock(&rs->bitmap_mutex);
1625 
1626     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1627 
1628     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1629 
1630     /* more than 1 second = 1000 milliseconds */
1631     if (end_time > rs->time_last_bitmap_sync + 1000) {
1632         bytes_xfer_now = ram_counters.transferred;
1633 
1634         /* During block migration the auto-converge logic incorrectly detects
1635          * that ram migration makes no progress. Avoid this by disabling the
1636          * throttling logic during the bulk phase of block migration. */
1637         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1638             /* The following detection logic can be refined later. For now:
1639                check whether the bytes dirtied in this period exceed 50% of
1640                the approximate amount of bytes that just got transferred
1641                since the last time we were in this routine. If that happens
1642                twice, start or increase throttling. */
1643 
1644             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1645                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1646                 (++rs->dirty_rate_high_cnt >= 2)) {
1647                     trace_migration_throttle();
1648                     rs->dirty_rate_high_cnt = 0;
1649                     mig_throttle_guest_down();
1650             }
1651         }
1652 
1653         migration_update_rates(rs, end_time);
1654 
1655         rs->iterations_prev = rs->iterations;
1656 
1657         /* reset period counters */
1658         rs->time_last_bitmap_sync = end_time;
1659         rs->num_dirty_pages_period = 0;
1660         rs->bytes_xfer_prev = bytes_xfer_now;
1661     }
1662     if (migrate_use_events()) {
1663         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1664     }
1665 }
1666 
1667 /**
1668  * save_zero_page: send the zero page to the stream
1669  *
1670  * Returns the number of pages written.
1671  *
1672  * @rs: current RAM state
1673  * @block: block that contains the page we want to send
1674  * @offset: offset inside the block for the page
1675  */
1676 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1677 {
1678     uint8_t *p = block->host + offset;
1679     int pages = -1;
1680 
1681     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1682         ram_counters.duplicate++;
1683         ram_counters.transferred +=
1684             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1685         qemu_put_byte(rs->f, 0);
1686         ram_counters.transferred += 1;
1687         pages = 1;
1688     }
1689 
1690     return pages;
1691 }
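
/*
 * On the wire a zero page is therefore just the usual page header with
 * RAM_SAVE_FLAG_ZERO set, followed by a single byte (the fill value, 0).
 * Assuming a 4 KiB target page, that is a handful of header bytes plus
 * one data byte instead of 4096 bytes of payload.
 */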
1692 
1693 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1694 {
1695     if (!migrate_release_ram() || !migration_in_postcopy()) {
1696         return;
1697     }
1698 
1699     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1700 }
1701 
1702 /*
1703  * @pages: the number of pages written by the control path,
1704  *        < 0 - error
1705  *        > 0 - number of pages written
1706  *
1707  * Returns true if the page has been saved, otherwise false.
1708  */
1709 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1710                               int *pages)
1711 {
1712     uint64_t bytes_xmit = 0;
1713     int ret;
1714 
1715     *pages = -1;
1716     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1717                                 &bytes_xmit);
1718     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1719         return false;
1720     }
1721 
1722     if (bytes_xmit) {
1723         ram_counters.transferred += bytes_xmit;
1724         *pages = 1;
1725     }
1726 
1727     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1728         return true;
1729     }
1730 
1731     if (bytes_xmit > 0) {
1732         ram_counters.normal++;
1733     } else if (bytes_xmit == 0) {
1734         ram_counters.duplicate++;
1735     }
1736 
1737     return true;
1738 }
1739 
1740 /*
1741  * directly send the page to the stream
1742  *
1743  * Returns the number of pages written.
1744  *
1745  * @rs: current RAM state
1746  * @block: block that contains the page we want to send
1747  * @offset: offset inside the block for the page
1748  * @buf: the page to be sent
1749  * @async: send to page asyncly
1750  */
1751 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1752                             uint8_t *buf, bool async)
1753 {
1754     ram_counters.transferred += save_page_header(rs, rs->f, block,
1755                                                  offset | RAM_SAVE_FLAG_PAGE);
1756     if (async) {
1757         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1758                               migrate_release_ram() &&
1759                               migration_in_postcopy());
1760     } else {
1761         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1762     }
1763     ram_counters.transferred += TARGET_PAGE_SIZE;
1764     ram_counters.normal++;
1765     return 1;
1766 }
1767 
1768 /**
1769  * ram_save_page: send the given page to the stream
1770  *
1771  * Returns the number of pages written.
1772  *          < 0 - error
1773  *          >=0 - Number of pages written - this might legally be 0
1774  *                if xbzrle noticed the page was the same.
1775  *
1776  * @rs: current RAM state
1777  * @block: block that contains the page we want to send
1778  * @offset: offset inside the block for the page
1779  * @last_stage: if we are at the completion stage
1780  */
1781 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1782 {
1783     int pages = -1;
1784     uint8_t *p;
1785     bool send_async = true;
1786     RAMBlock *block = pss->block;
1787     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1788     ram_addr_t current_addr = block->offset + offset;
1789 
1790     p = block->host + offset;
1791     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1792 
1793     XBZRLE_cache_lock();
1794     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1795         migrate_use_xbzrle()) {
1796         pages = save_xbzrle_page(rs, &p, current_addr, block,
1797                                  offset, last_stage);
1798         if (!last_stage) {
1799             /* Can't send this cached data async, since the cache page
1800              * might get updated before it gets to the wire
1801              */
1802             send_async = false;
1803         }
1804     }
1805 
1806     /* XBZRLE overflow or normal page */
1807     if (pages == -1) {
1808         pages = save_normal_page(rs, block, offset, p, send_async);
1809     }
1810 
1811     XBZRLE_cache_unlock();
1812 
1813     return pages;
1814 }
1815 
1816 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1817                                  ram_addr_t offset)
1818 {
1819     multifd_queue_page(block, offset);
1820     ram_counters.normal++;
1821 
1822     return 1;
1823 }
1824 
1825 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1826                                 ram_addr_t offset, uint8_t *source_buf)
1827 {
1828     RAMState *rs = ram_state;
1829     int bytes_sent, blen;
1830     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1831 
1832     bytes_sent = save_page_header(rs, f, block, offset |
1833                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1834 
1835     /*
1836      * copy it to an internal buffer to avoid it being modified by the VM,
1837      * so that we can catch any error during compression and
1838      * decompression
1839      */
1840     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1841     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1842     if (blen < 0) {
1843         bytes_sent = 0;
1844         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1845         error_report("compressed data failed!");
1846     } else {
1847         bytes_sent += blen;
1848         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1849     }
1850 
1851     return bytes_sent;
1852 }
1853 
1854 static void flush_compressed_data(RAMState *rs)
1855 {
1856     int idx, len, thread_count;
1857 
1858     if (!migrate_use_compression()) {
1859         return;
1860     }
1861     thread_count = migrate_compress_threads();
1862 
1863     qemu_mutex_lock(&comp_done_lock);
1864     for (idx = 0; idx < thread_count; idx++) {
1865         while (!comp_param[idx].done) {
1866             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1867         }
1868     }
1869     qemu_mutex_unlock(&comp_done_lock);
1870 
1871     for (idx = 0; idx < thread_count; idx++) {
1872         qemu_mutex_lock(&comp_param[idx].mutex);
1873         if (!comp_param[idx].quit) {
1874             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1875             ram_counters.transferred += len;
1876         }
1877         qemu_mutex_unlock(&comp_param[idx].mutex);
1878     }
1879 }
1880 
1881 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1882                                        ram_addr_t offset)
1883 {
1884     param->block = block;
1885     param->offset = offset;
1886 }
1887 
1888 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1889                                            ram_addr_t offset)
1890 {
1891     int idx, thread_count, bytes_xmit = -1, pages = -1;
1892 
1893     thread_count = migrate_compress_threads();
1894     qemu_mutex_lock(&comp_done_lock);
1895     while (true) {
1896         for (idx = 0; idx < thread_count; idx++) {
1897             if (comp_param[idx].done) {
1898                 comp_param[idx].done = false;
1899                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1900                 qemu_mutex_lock(&comp_param[idx].mutex);
1901                 set_compress_params(&comp_param[idx], block, offset);
1902                 qemu_cond_signal(&comp_param[idx].cond);
1903                 qemu_mutex_unlock(&comp_param[idx].mutex);
1904                 pages = 1;
1905                 ram_counters.normal++;
1906                 ram_counters.transferred += bytes_xmit;
1907                 break;
1908             }
1909         }
1910         if (pages > 0) {
1911             break;
1912         } else {
1913             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1914         }
1915     }
1916     qemu_mutex_unlock(&comp_done_lock);
1917 
1918     return pages;
1919 }
1920 
1921 /**
1922  * find_dirty_block: find the next dirty page and update any state
1923  * associated with the search process.
1924  *
1925  * Returns true if a page is found
1926  *
1927  * @rs: current RAM state
1928  * @pss: data about the state of the current dirty page scan
1929  * @again: set to false if the search has scanned the whole of RAM
1930  */
1931 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1932 {
1933     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1934     if (pss->complete_round && pss->block == rs->last_seen_block &&
1935         pss->page >= rs->last_page) {
1936         /*
1937          * We've been once around the RAM and haven't found anything.
1938          * Give up.
1939          */
1940         *again = false;
1941         return false;
1942     }
1943     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1944         /* Didn't find anything in this RAM Block */
1945         pss->page = 0;
1946         pss->block = QLIST_NEXT_RCU(pss->block, next);
1947         if (!pss->block) {
1948             /* Hit the end of the list */
1949             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1950             /* Flag that we've looped */
1951             pss->complete_round = true;
1952             rs->ram_bulk_stage = false;
1953             if (migrate_use_xbzrle()) {
1954                 /* If xbzrle is on, stop using the data compression at this
1955                  * point. In theory, xbzrle can do better than compression.
1956                  */
1957                 flush_compressed_data(rs);
1958             }
1959         }
1960         /* Didn't find anything this time, but try again on the new block */
1961         *again = true;
1962         return false;
1963     } else {
1964         /* Can go around again, but... */
1965         *again = true;
1966         /* We've found something so probably don't need to */
1967         return true;
1968     }
1969 }
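
/*
 * In short: the scan resumes from wherever the previous call stopped,
 * walks the current block to its end, then moves on block by block,
 * wrapping from the last block back to the first.  Only once it has come
 * all the way around to its starting point without finding a dirty page
 * does it clear *again and let the caller stop.
 */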
1970 
1971 /**
1972  * unqueue_page: gets a page off the queue
1973  *
1974  * Helper for 'get_queued_page' - gets a page off the queue
1975  *
1976  * Returns the block of the page (or NULL if none available)
1977  *
1978  * @rs: current RAM state
1979  * @offset: used to return the offset within the RAMBlock
1980  */
1981 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1982 {
1983     RAMBlock *block = NULL;
1984 
1985     qemu_mutex_lock(&rs->src_page_req_mutex);
1986     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1987         struct RAMSrcPageRequest *entry =
1988                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1989         block = entry->rb;
1990         *offset = entry->offset;
1991 
1992         if (entry->len > TARGET_PAGE_SIZE) {
1993             entry->len -= TARGET_PAGE_SIZE;
1994             entry->offset += TARGET_PAGE_SIZE;
1995         } else {
1996             memory_region_unref(block->mr);
1997             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998             g_free(entry);
1999             migration_consume_urgent_request();
2000         }
2001     }
2002     qemu_mutex_unlock(&rs->src_page_req_mutex);
2003 
2004     return block;
2005 }
2006 
2007 /**
2008  * get_queued_page: unqueue a page from the postcopy requests
2009  *
2010  * Skips pages that are already sent (!dirty)
2011  *
2012  * Returns true if a queued page is found
2013  *
2014  * @rs: current RAM state
2015  * @pss: data about the state of the current dirty page scan
2016  */
2017 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2018 {
2019     RAMBlock  *block;
2020     ram_addr_t offset;
2021     bool dirty;
2022 
2023     do {
2024         block = unqueue_page(rs, &offset);
2025         /*
2026          * We're sending this page, and since it's postcopy nothing else
2027          * will dirty it, and we must make sure it doesn't get sent again
2028          * even if this queue request was received after the background
2029          * search already sent it.
2030          */
2031         if (block) {
2032             unsigned long page;
2033 
2034             page = offset >> TARGET_PAGE_BITS;
2035             dirty = test_bit(page, block->bmap);
2036             if (!dirty) {
2037                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2038                        page, test_bit(page, block->unsentmap));
2039             } else {
2040                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2041             }
2042         }
2043 
2044     } while (block && !dirty);
2045 
2046     if (block) {
2047         /*
2048          * As soon as we start servicing pages out of order, we have
2049          * to kill the bulk stage, since the bulk stage assumes
2050          * (in migration_bitmap_find_and_reset_dirty) that every page is
2051          * dirty; that's no longer true.
2052          */
2053         rs->ram_bulk_stage = false;
2054 
2055         /*
2056          * We want the background search to continue from the queued page
2057          * since the guest is likely to want other pages near to the page
2058          * it just requested.
2059          */
2060         pss->block = block;
2061         pss->page = offset >> TARGET_PAGE_BITS;
2062     }
2063 
2064     return !!block;
2065 }
2066 
2067 /**
2068  * migration_page_queue_free: drop any remaining pages in the ram
2069  * request queue
2070  *
2071  * It should be empty at the end anyway, but in error cases there may
2072  * be some left.  If any pages are left, we drop them.
2073  *
2074  */
2075 static void migration_page_queue_free(RAMState *rs)
2076 {
2077     struct RAMSrcPageRequest *mspr, *next_mspr;
2078     /* This queue generally should be empty - but in the case of a failed
2079      * migration might have some droppings in.
2080      */
2081     rcu_read_lock();
2082     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2083         memory_region_unref(mspr->rb->mr);
2084         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2085         g_free(mspr);
2086     }
2087     rcu_read_unlock();
2088 }
2089 
2090 /**
2091  * ram_save_queue_pages: queue the page for transmission
2092  *
2093  * A request from postcopy destination for example.
2094  *
2095  * Returns zero on success or negative on error
2096  *
2097  * @rbname: Name of the RAMBlock of the request. NULL means the
2098  *          same as the last one.
2099  * @start: starting address from the start of the RAMBlock
2100  * @len: length (in bytes) to send
2101  */
2102 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2103 {
2104     RAMBlock *ramblock;
2105     RAMState *rs = ram_state;
2106 
2107     ram_counters.postcopy_requests++;
2108     rcu_read_lock();
2109     if (!rbname) {
2110         /* Reuse last RAMBlock */
2111         ramblock = rs->last_req_rb;
2112 
2113         if (!ramblock) {
2114             /*
2115              * Shouldn't happen, we can't reuse the last RAMBlock if
2116              * it's the 1st request.
2117              */
2118             error_report("ram_save_queue_pages no previous block");
2119             goto err;
2120         }
2121     } else {
2122         ramblock = qemu_ram_block_by_name(rbname);
2123 
2124         if (!ramblock) {
2125             /* We shouldn't be asked for a non-existent RAMBlock */
2126             error_report("ram_save_queue_pages no block '%s'", rbname);
2127             goto err;
2128         }
2129         rs->last_req_rb = ramblock;
2130     }
2131     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2132     if (start + len > ramblock->used_length) {
2133         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2134                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2135                      __func__, start, len, ramblock->used_length);
2136         goto err;
2137     }
2138 
2139     struct RAMSrcPageRequest *new_entry =
2140         g_malloc0(sizeof(struct RAMSrcPageRequest));
2141     new_entry->rb = ramblock;
2142     new_entry->offset = start;
2143     new_entry->len = len;
2144 
2145     memory_region_ref(ramblock->mr);
2146     qemu_mutex_lock(&rs->src_page_req_mutex);
2147     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2148     migration_make_urgent_request();
2149     qemu_mutex_unlock(&rs->src_page_req_mutex);
2150     rcu_read_unlock();
2151 
2152     return 0;
2153 
2154 err:
2155     rcu_read_unlock();
2156     return -1;
2157 }
2158 
2159 static bool save_page_use_compression(RAMState *rs)
2160 {
2161     if (!migrate_use_compression()) {
2162         return false;
2163     }
2164 
2165     /*
2166      * If xbzrle is on, stop using the data compression after first
2167      * round of migration even if compression is enabled. In theory,
2168      * xbzrle can do better than compression.
2169      */
2170     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2171         return true;
2172     }
2173 
2174     return false;
2175 }
2176 
2177 /**
2178  * ram_save_target_page: save one target page
2179  *
2180  * Returns the number of pages written
2181  *
2182  * @rs: current RAM state
2183  * @pss: data about the page we want to send
2184  * @last_stage: if we are at the completion stage
2185  */
2186 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2187                                 bool last_stage)
2188 {
2189     RAMBlock *block = pss->block;
2190     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2191     int res;
2192 
2193     if (control_save_page(rs, block, offset, &res)) {
2194         return res;
2195     }
2196 
2197     /*
2198      * When starting the process of a new block, the first page of
2199      * the block should be sent out before other pages in the same
2200      * block, and all the pages in the last block should have been sent
2201      * out.  Keeping this order is important because the 'cont' flag
2202      * is used to avoid resending the block name.
2203      */
2204     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2205         flush_compressed_data(rs);
2206     }
2207 
2208     res = save_zero_page(rs, block, offset);
2209     if (res > 0) {
2210         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2211          * page would be stale
2212          */
2213         if (!save_page_use_compression(rs)) {
2214             XBZRLE_cache_lock();
2215             xbzrle_cache_zero_page(rs, block->offset + offset);
2216             XBZRLE_cache_unlock();
2217         }
2218         ram_release_pages(block->idstr, offset, res);
2219         return res;
2220     }
2221 
2222     /*
2223      * Make sure the first page is sent out before other pages.
2224      *
2225      * We post the first page as a normal page as compression will
2226      * take up much CPU resource.
2227      */
2228     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2229         return compress_page_with_multi_thread(rs, block, offset);
2230     } else if (migrate_use_multifd()) {
2231         return ram_save_multifd_page(rs, block, offset);
2232     }
2233 
2234     return ram_save_page(rs, pss, last_stage);
2235 }
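
/*
 * To summarize the path selection above: the control path (e.g. RDMA)
 * gets the first chance to transmit the page; otherwise a zero page is
 * sent as a single byte; pages that continue the current block may then
 * be handed to the compression threads; multifd takes the page if it is
 * enabled; everything else falls through to ram_save_page(), which
 * handles both XBZRLE and plain page transfers.
 */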
2236 
2237 /**
2238  * ram_save_host_page: save a whole host page
2239  *
2240  * Starting at *offset send pages up to the end of the current host
2241  * page. It's valid for the initial offset to point into the middle of
2242  * a host page in which case the remainder of the hostpage is sent.
2243  * Only dirty target pages are sent. Note that the host page size may
2244  * be a huge page for this block.
2245  * The saving stops at the boundary of the used_length of the block
2246  * if the RAMBlock isn't a multiple of the host page size.
2247  *
2248  * Returns the number of pages written or negative on error
2249  *
2250  * @rs: current RAM state
2252  * @pss: data about the page we want to send
2253  * @last_stage: if we are at the completion stage
2254  */
2255 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2256                               bool last_stage)
2257 {
2258     int tmppages, pages = 0;
2259     size_t pagesize_bits =
2260         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2261 
2262     if (!qemu_ram_is_migratable(pss->block)) {
2263         error_report("block %s should not be migrated !", pss->block->idstr);
2264         return 0;
2265     }
2266 
2267     do {
2268         /* Check if the page is dirty and if so, send it */
2269         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2270             pss->page++;
2271             continue;
2272         }
2273 
2274         tmppages = ram_save_target_page(rs, pss, last_stage);
2275         if (tmppages < 0) {
2276             return tmppages;
2277         }
2278 
2279         pages += tmppages;
2280         if (pss->block->unsentmap) {
2281             clear_bit(pss->page, pss->block->unsentmap);
2282         }
2283 
2284         pss->page++;
2285     } while ((pss->page & (pagesize_bits - 1)) &&
2286              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2287 
2288     /* The offset we leave with is the last one we looked at */
2289     pss->page--;
2290     return pages;
2291 }
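
/*
 * Example for the host-page loop above (illustrative sizes): for a
 * RAMBlock backed by 2 MiB huge pages with a 4 KiB target page,
 * pagesize_bits is 2 MiB / 4 KiB = 512, so the loop keeps sending dirty
 * target pages until pss->page reaches a multiple of 512, i.e. the whole
 * huge page is covered in a single call.
 */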
2292 
2293 /**
2294  * ram_find_and_save_block: finds a dirty page and sends it to f
2295  *
2296  * Called within an RCU critical section.
2297  *
2298  * Returns the number of pages written where zero means no dirty pages
2299  *
2300  * @rs: current RAM state
2301  * @last_stage: if we are at the completion stage
2302  *
2303  * On systems where host-page-size > target-page-size it will send all the
2304  * pages in a host page that are dirty.
2305  */
2306 
2307 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2308 {
2309     PageSearchStatus pss;
2310     int pages = 0;
2311     bool again, found;
2312 
2313     /* No dirty page as there is zero RAM */
2314     if (!ram_bytes_total()) {
2315         return pages;
2316     }
2317 
2318     pss.block = rs->last_seen_block;
2319     pss.page = rs->last_page;
2320     pss.complete_round = false;
2321 
2322     if (!pss.block) {
2323         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2324     }
2325 
2326     do {
2327         again = true;
2328         found = get_queued_page(rs, &pss);
2329 
2330         if (!found) {
2331             /* priority queue empty, so just search for something dirty */
2332             found = find_dirty_block(rs, &pss, &again);
2333         }
2334 
2335         if (found) {
2336             pages = ram_save_host_page(rs, &pss, last_stage);
2337         }
2338     } while (!pages && again);
2339 
2340     rs->last_seen_block = pss.block;
2341     rs->last_page = pss.page;
2342 
2343     return pages;
2344 }
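
/*
 * Note that each call above services at most one host page: pages queued
 * by postcopy requests take priority over the background dirty scan, and
 * the scan position is remembered in the RAMState so the next call picks
 * up where this one left off.
 */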
2345 
2346 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2347 {
2348     uint64_t pages = size / TARGET_PAGE_SIZE;
2349 
2350     if (zero) {
2351         ram_counters.duplicate += pages;
2352     } else {
2353         ram_counters.normal += pages;
2354         ram_counters.transferred += size;
2355         qemu_update_position(f, size);
2356     }
2357 }
2358 
2359 uint64_t ram_bytes_total(void)
2360 {
2361     RAMBlock *block;
2362     uint64_t total = 0;
2363 
2364     rcu_read_lock();
2365     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2366         total += block->used_length;
2367     }
2368     rcu_read_unlock();
2369     return total;
2370 }
2371 
2372 static void xbzrle_load_setup(void)
2373 {
2374     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2375 }
2376 
2377 static void xbzrle_load_cleanup(void)
2378 {
2379     g_free(XBZRLE.decoded_buf);
2380     XBZRLE.decoded_buf = NULL;
2381 }
2382 
2383 static void ram_state_cleanup(RAMState **rsp)
2384 {
2385     if (*rsp) {
2386         migration_page_queue_free(*rsp);
2387         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2388         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2389         g_free(*rsp);
2390         *rsp = NULL;
2391     }
2392 }
2393 
2394 static void xbzrle_cleanup(void)
2395 {
2396     XBZRLE_cache_lock();
2397     if (XBZRLE.cache) {
2398         cache_fini(XBZRLE.cache);
2399         g_free(XBZRLE.encoded_buf);
2400         g_free(XBZRLE.current_buf);
2401         g_free(XBZRLE.zero_target_page);
2402         XBZRLE.cache = NULL;
2403         XBZRLE.encoded_buf = NULL;
2404         XBZRLE.current_buf = NULL;
2405         XBZRLE.zero_target_page = NULL;
2406     }
2407     XBZRLE_cache_unlock();
2408 }
2409 
2410 static void ram_save_cleanup(void *opaque)
2411 {
2412     RAMState **rsp = opaque;
2413     RAMBlock *block;
2414 
2415     /* The caller must hold the iothread lock or be in a BH, so there is
2416      * no writing race against this migration_bitmap
2417      */
2418     memory_global_dirty_log_stop();
2419 
2420     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2421         g_free(block->bmap);
2422         block->bmap = NULL;
2423         g_free(block->unsentmap);
2424         block->unsentmap = NULL;
2425     }
2426 
2427     xbzrle_cleanup();
2428     compress_threads_save_cleanup();
2429     ram_state_cleanup(rsp);
2430 }
2431 
2432 static void ram_state_reset(RAMState *rs)
2433 {
2434     rs->last_seen_block = NULL;
2435     rs->last_sent_block = NULL;
2436     rs->last_page = 0;
2437     rs->last_version = ram_list.version;
2438     rs->ram_bulk_stage = true;
2439 }
2440 
2441 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2442 
2443 /*
2444  * 'expected' is the value you expect the bitmap mostly to be full
2445  * of; it won't bother printing lines that are all this value.
2447  */
2448 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2449                            unsigned long pages)
2450 {
2451     int64_t cur;
2452     int64_t linelen = 128;
2453     char linebuf[129];
2454 
2455     for (cur = 0; cur < pages; cur += linelen) {
2456         int64_t curb;
2457         bool found = false;
2458         /*
2459          * Last line; catch the case where the line length
2460          * is longer than remaining ram
2461          */
2462         if (cur + linelen > pages) {
2463             linelen = pages - cur;
2464         }
2465         for (curb = 0; curb < linelen; curb++) {
2466             bool thisbit = test_bit(cur + curb, todump);
2467             linebuf[curb] = thisbit ? '1' : '.';
2468             found = found || (thisbit != expected);
2469         }
2470         if (found) {
2471             linebuf[curb] = '\0';
2472             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2473         }
2474     }
2475 }
2476 
2477 /* **** functions for postcopy ***** */
2478 
2479 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2480 {
2481     struct RAMBlock *block;
2482 
2483     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2484         unsigned long *bitmap = block->bmap;
2485         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2486         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2487 
2488         while (run_start < range) {
2489             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2490             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2491                               (run_end - run_start) << TARGET_PAGE_BITS);
2492             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2493         }
2494     }
2495 }
2496 
2497 /**
2498  * postcopy_send_discard_bm_ram: discard a RAMBlock
2499  *
2500  * Returns zero on success
2501  *
2502  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2503  * Note: At this point the 'unsentmap' is the processed bitmap combined
2504  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2505  *
2506  * @ms: current migration state
2507  * @pds: state for postcopy
2508  * @block: RAMBlock to discard
2510  */
2511 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2512                                         PostcopyDiscardState *pds,
2513                                         RAMBlock *block)
2514 {
2515     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2516     unsigned long current;
2517     unsigned long *unsentmap = block->unsentmap;
2518 
2519     for (current = 0; current < end; ) {
2520         unsigned long one = find_next_bit(unsentmap, end, current);
2521 
2522         if (one <= end) {
2523             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2524             unsigned long discard_length;
2525 
2526             if (zero >= end) {
2527                 discard_length = end - one;
2528             } else {
2529                 discard_length = zero - one;
2530             }
2531             if (discard_length) {
2532                 postcopy_discard_send_range(ms, pds, one, discard_length);
2533             }
2534             current = one + discard_length;
2535         } else {
2536             current = one;
2537         }
2538     }
2539 
2540     return 0;
2541 }
2542 
2543 /**
2544  * postcopy_each_ram_send_discard: discard all RAMBlocks
2545  *
2546  * Returns 0 for success or negative for error
2547  *
2548  * Utility for the outgoing postcopy code.
2549  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2550  *   passing it bitmap indexes and name.
2551  * (qemu_ram_foreach_block ends up passing unscaled lengths
2552  *  which would mean postcopy code would have to deal with target page)
2553  *
2554  * @ms: current migration state
2555  */
2556 static int postcopy_each_ram_send_discard(MigrationState *ms)
2557 {
2558     struct RAMBlock *block;
2559     int ret;
2560 
2561     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2562         PostcopyDiscardState *pds =
2563             postcopy_discard_send_init(ms, block->idstr);
2564 
2565         /*
2566          * Postcopy sends chunks of bitmap over the wire, but it
2567          * just needs indexes at this point, avoids it having
2568          * target page specific code.
2569          */
2570         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2571         postcopy_discard_send_finish(ms, pds);
2572         if (ret) {
2573             return ret;
2574         }
2575     }
2576 
2577     return 0;
2578 }
2579 
2580 /**
2581  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2582  *
2583  * Helper for postcopy_chunk_hostpages; it's called twice to
2584  * canonicalize the two bitmaps, that are similar, but one is
2585  * inverted.
2586  *
2587  * Postcopy requires that all target pages in a hostpage are dirty or
2588  * clean, not a mix.  This function canonicalizes the bitmaps.
2589  *
2590  * @ms: current migration state
2591  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2592  *               otherwise we need to canonicalize partially dirty host pages
2593  * @block: block that contains the page we want to canonicalize
2594  * @pds: state for postcopy
2595  */
2596 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2597                                           RAMBlock *block,
2598                                           PostcopyDiscardState *pds)
2599 {
2600     RAMState *rs = ram_state;
2601     unsigned long *bitmap = block->bmap;
2602     unsigned long *unsentmap = block->unsentmap;
2603     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2604     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2605     unsigned long run_start;
2606 
2607     if (block->page_size == TARGET_PAGE_SIZE) {
2608         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2609         return;
2610     }
2611 
2612     if (unsent_pass) {
2613         /* Find a sent page */
2614         run_start = find_next_zero_bit(unsentmap, pages, 0);
2615     } else {
2616         /* Find a dirty page */
2617         run_start = find_next_bit(bitmap, pages, 0);
2618     }
2619 
2620     while (run_start < pages) {
2621         bool do_fixup = false;
2622         unsigned long fixup_start_addr;
2623         unsigned long host_offset;
2624 
2625         /*
2626          * If the start of this run of pages is in the middle of a host
2627          * page, then we need to fixup this host page.
2628          */
2629         host_offset = run_start % host_ratio;
2630         if (host_offset) {
2631             do_fixup = true;
2632             run_start -= host_offset;
2633             fixup_start_addr = run_start;
2634             /* For the next pass */
2635             run_start = run_start + host_ratio;
2636         } else {
2637             /* Find the end of this run */
2638             unsigned long run_end;
2639             if (unsent_pass) {
2640                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2641             } else {
2642                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2643             }
2644             /*
2645              * If the end isn't at the start of a host page, then the
2646              * run doesn't finish at the end of a host page
2647              * and we need to discard.
2648              */
2649             host_offset = run_end % host_ratio;
2650             if (host_offset) {
2651                 do_fixup = true;
2652                 fixup_start_addr = run_end - host_offset;
2653                 /*
2654                  * This host page has gone, the next loop iteration starts
2655                  * from after the fixup
2656                  */
2657                 run_start = fixup_start_addr + host_ratio;
2658             } else {
2659                 /*
2660                  * No discards on this iteration, next loop starts from
2661                  * next sent/dirty page
2662                  */
2663                 run_start = run_end + 1;
2664             }
2665         }
2666 
2667         if (do_fixup) {
2668             unsigned long page;
2669 
2670             /* Tell the destination to discard this page */
2671             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2672                 /* For the unsent_pass we:
2673                  *     discard partially sent pages
2674                  * For the !unsent_pass (dirty) we:
2675                  *     discard partially dirty pages that were sent
2676                  *     (any partially sent pages were already discarded
2677                  *     by the previous unsent_pass)
2678                  */
2679                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2680                                             host_ratio);
2681             }
2682 
2683             /* Clean up the bitmap */
2684             for (page = fixup_start_addr;
2685                  page < fixup_start_addr + host_ratio; page++) {
2686                 /* All pages in this host page are now not sent */
2687                 set_bit(page, unsentmap);
2688 
2689                 /*
2690                  * Remark them as dirty, updating the count for any pages
2691                  * that weren't previously dirty.
2692                  */
2693                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2694             }
2695         }
2696 
2697         if (unsent_pass) {
2698             /* Find the next sent page for the next iteration */
2699             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2700         } else {
2701             /* Find the next dirty page for the next iteration */
2702             run_start = find_next_bit(bitmap, pages, run_start);
2703         }
2704     }
2705 }
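
/*
 * Worked example for the fixup above (illustrative sizes): with 2 MiB
 * host pages and 4 KiB target pages, host_ratio is 512.  If a run of
 * dirty target pages starts at page index 1000, host_offset is
 * 1000 % 512 = 488, so the run is pulled back to fixup_start_addr = 512
 * and the whole host page [512, 1024) is handled as a unit: the
 * destination is told to discard it (when needed) and all 512 target
 * pages are re-marked unsent and dirty, so host pages end up either
 * fully dirty or fully clean.
 */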
2706 
2707 /**
2708  * postcopy_chunk_hostpages: discard any partially sent host page
2709  *
2710  * Utility for the outgoing postcopy code.
2711  *
2712  * Discard any partially sent host-page size chunks, mark any partially
2713  * dirty host-page size chunks as all dirty.  In this case the host-page
2714  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2715  *
2716  * Returns zero on success
2717  *
2718  * @ms: current migration state
2719  * @block: block we want to work with
2720  */
2721 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2722 {
2723     PostcopyDiscardState *pds =
2724         postcopy_discard_send_init(ms, block->idstr);
2725 
2726     /* First pass: Discard all partially sent host pages */
2727     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2728     /*
2729      * Second pass: Ensure that all partially dirty host pages are made
2730      * fully dirty.
2731      */
2732     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2733 
2734     postcopy_discard_send_finish(ms, pds);
2735     return 0;
2736 }
2737 
2738 /**
2739  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2740  *
2741  * Returns zero on success
2742  *
2743  * Transmit the set of pages to be discarded after precopy to the target;
2744  * these are pages that:
2745  *     a) Have been previously transmitted but are now dirty again
2746  *     b) Pages that have never been transmitted; this ensures that
2747  *        any pages on the destination that have been mapped by background
2748  *        tasks get discarded (transparent huge pages is the specific concern)
2749  * Hopefully this is pretty sparse
2750  *
2751  * @ms: current migration state
2752  */
2753 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2754 {
2755     RAMState *rs = ram_state;
2756     RAMBlock *block;
2757     int ret;
2758 
2759     rcu_read_lock();
2760 
2761     /* This should be our last sync, the src is now paused */
2762     migration_bitmap_sync(rs);
2763 
2764     /* Easiest way to make sure we don't resume in the middle of a host-page */
2765     rs->last_seen_block = NULL;
2766     rs->last_sent_block = NULL;
2767     rs->last_page = 0;
2768 
2769     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2770         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2771         unsigned long *bitmap = block->bmap;
2772         unsigned long *unsentmap = block->unsentmap;
2773 
2774         if (!unsentmap) {
2775             /* We don't have a safe way to resize the unsentmap, so
2776              * if the bitmap was resized it will be NULL at this
2777              * point.
2778              */
2779             error_report("migration ram resized during precopy phase");
2780             rcu_read_unlock();
2781             return -EINVAL;
2782         }
2783         /* Deal with TPS != HPS and huge pages */
2784         ret = postcopy_chunk_hostpages(ms, block);
2785         if (ret) {
2786             rcu_read_unlock();
2787             return ret;
2788         }
2789 
2790         /*
2791          * Update the unsentmap to be unsentmap = unsentmap | dirty
2792          */
2793         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2794 #ifdef DEBUG_POSTCOPY
2795         ram_debug_dump_bitmap(unsentmap, true, pages);
2796 #endif
2797     }
2798     trace_ram_postcopy_send_discard_bitmap();
2799 
2800     ret = postcopy_each_ram_send_discard(ms);
2801     rcu_read_unlock();
2802 
2803     return ret;
2804 }
2805 
2806 /**
2807  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2808  *
2809  * Returns zero on success
2810  *
2811  * @rbname: name of the RAMBlock of the request. NULL means the
2812  *          same as the last one.
2813  * @start: starting address (byte offset) within the RAMBlock
2814  * @length: length in bytes to discard
2815  */
2816 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2817 {
2818     int ret = -1;
2819 
2820     trace_ram_discard_range(rbname, start, length);
2821 
2822     rcu_read_lock();
2823     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2824 
2825     if (!rb) {
2826         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2827         goto err;
2828     }
2829 
2830     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2831                  length >> qemu_target_page_bits());
2832     ret = ram_block_discard_range(rb, start, length);
2833 
2834 err:
2835     rcu_read_unlock();
2836 
2837     return ret;
2838 }
2839 
2840 /*
2841  * For every allocation, we will try not to crash the VM if the
2842  * allocation fails.
2843  */
2844 static int xbzrle_init(void)
2845 {
2846     Error *local_err = NULL;
2847 
2848     if (!migrate_use_xbzrle()) {
2849         return 0;
2850     }
2851 
2852     XBZRLE_cache_lock();
2853 
2854     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2855     if (!XBZRLE.zero_target_page) {
2856         error_report("%s: Error allocating zero page", __func__);
2857         goto err_out;
2858     }
2859 
2860     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2861                               TARGET_PAGE_SIZE, &local_err);
2862     if (!XBZRLE.cache) {
2863         error_report_err(local_err);
2864         goto free_zero_page;
2865     }
2866 
2867     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2868     if (!XBZRLE.encoded_buf) {
2869         error_report("%s: Error allocating encoded_buf", __func__);
2870         goto free_cache;
2871     }
2872 
2873     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2874     if (!XBZRLE.current_buf) {
2875         error_report("%s: Error allocating current_buf", __func__);
2876         goto free_encoded_buf;
2877     }
2878 
2879     /* We are all good */
2880     XBZRLE_cache_unlock();
2881     return 0;
2882 
2883 free_encoded_buf:
2884     g_free(XBZRLE.encoded_buf);
2885     XBZRLE.encoded_buf = NULL;
2886 free_cache:
2887     cache_fini(XBZRLE.cache);
2888     XBZRLE.cache = NULL;
2889 free_zero_page:
2890     g_free(XBZRLE.zero_target_page);
2891     XBZRLE.zero_target_page = NULL;
2892 err_out:
2893     XBZRLE_cache_unlock();
2894     return -ENOMEM;
2895 }
2896 
2897 static int ram_state_init(RAMState **rsp)
2898 {
2899     *rsp = g_try_new0(RAMState, 1);
2900 
2901     if (!*rsp) {
2902         error_report("%s: Init ramstate fail", __func__);
2903         return -1;
2904     }
2905 
2906     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2907     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2908     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2909 
2910     /*
2911      * Count the total number of pages used by ram blocks not including any
2912      * gaps due to alignment or unplugs.
2913      */
2914     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2915 
2916     ram_state_reset(*rsp);
2917 
2918     return 0;
2919 }
2920 
2921 static void ram_list_init_bitmaps(void)
2922 {
2923     RAMBlock *block;
2924     unsigned long pages;
2925 
2926     /* Skip setting bitmap if there is no RAM */
2927     if (ram_bytes_total()) {
2928         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2929             pages = block->max_length >> TARGET_PAGE_BITS;
2930             block->bmap = bitmap_new(pages);
2931             bitmap_set(block->bmap, 0, pages);
2932             if (migrate_postcopy_ram()) {
2933                 block->unsentmap = bitmap_new(pages);
2934                 bitmap_set(block->unsentmap, 0, pages);
2935             }
2936         }
2937     }
2938 }
2939 
2940 static void ram_init_bitmaps(RAMState *rs)
2941 {
2942     /* For memory_global_dirty_log_start below.  */
2943     qemu_mutex_lock_iothread();
2944     qemu_mutex_lock_ramlist();
2945     rcu_read_lock();
2946 
2947     ram_list_init_bitmaps();
2948     memory_global_dirty_log_start();
2949     migration_bitmap_sync(rs);
2950 
2951     rcu_read_unlock();
2952     qemu_mutex_unlock_ramlist();
2953     qemu_mutex_unlock_iothread();
2954 }
2955 
2956 static int ram_init_all(RAMState **rsp)
2957 {
2958     if (ram_state_init(rsp)) {
2959         return -1;
2960     }
2961 
2962     if (xbzrle_init()) {
2963         ram_state_cleanup(rsp);
2964         return -1;
2965     }
2966 
2967     ram_init_bitmaps(*rsp);
2968 
2969     return 0;
2970 }
2971 
2972 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2973 {
2974     RAMBlock *block;
2975     uint64_t pages = 0;
2976 
2977     /*
2978      * Postcopy is not using xbzrle/compression, so no need for that.
2979      * Also, since the source is already halted, we don't need to care
2980      * about dirty page logging either.
2981      */
2982 
2983     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2984         pages += bitmap_count_one(block->bmap,
2985                                   block->used_length >> TARGET_PAGE_BITS);
2986     }
2987 
2988     /* This may not be aligned with current bitmaps. Recalculate. */
2989     rs->migration_dirty_pages = pages;
2990 
2991     rs->last_seen_block = NULL;
2992     rs->last_sent_block = NULL;
2993     rs->last_page = 0;
2994     rs->last_version = ram_list.version;
2995     /*
2996      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2997      * matter what we have sent.
2998      */
2999     rs->ram_bulk_stage = false;
3000 
3001     /* Update RAMState cache of output QEMUFile */
3002     rs->f = out;
3003 
3004     trace_ram_state_resume_prepare(pages);
3005 }
3006 
3007 /*
3008  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3009  * long-running RCU critical section.  When rcu-reclaims in the code
3010  * start to become numerous it will be necessary to reduce the
3011  * granularity of these critical sections.
3012  */
3013 
3014 /**
3015  * ram_save_setup: Setup RAM for migration
3016  *
3017  * Returns zero to indicate success and negative for error
3018  *
3019  * @f: QEMUFile where to send the data
3020  * @opaque: RAMState pointer
3021  */
3022 static int ram_save_setup(QEMUFile *f, void *opaque)
3023 {
3024     RAMState **rsp = opaque;
3025     RAMBlock *block;
3026 
3027     if (compress_threads_save_setup()) {
3028         return -1;
3029     }
3030 
3031     /* migration has already set up the bitmap, reuse it. */
3032     if (!migration_in_colo_state()) {
3033         if (ram_init_all(rsp) != 0) {
3034             compress_threads_save_cleanup();
3035             return -1;
3036         }
3037     }
3038     (*rsp)->f = f;
3039 
3040     rcu_read_lock();
3041 
3042     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3043 
3044     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3045         qemu_put_byte(f, strlen(block->idstr));
3046         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3047         qemu_put_be64(f, block->used_length);
3048         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3049             qemu_put_be64(f, block->page_size);
3050         }
3051     }
3052 
3053     rcu_read_unlock();
3054 
3055     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3056     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3057 
3058     multifd_send_sync_main();
3059     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3060     qemu_fflush(f);
3061 
3062     return 0;
3063 }
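
/*
 * Sketch of the setup section emitted above, following the qemu_put_*
 * calls (field widths as written to the stream):
 *
 *   be64: total RAM size | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable RAMBlock:
 *       byte:  strlen(idstr)
 *       bytes: idstr (not NUL terminated)
 *       be64:  used_length
 *       be64:  page_size (only with postcopy-ram and a page size that
 *              differs from qemu_host_page_size)
 *   be64: RAM_SAVE_FLAG_EOS
 */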
3064 
3065 /**
3066  * ram_save_iterate: iterative stage for migration
3067  *
3068  * Returns zero to indicate success and negative for error
3069  *
3070  * @f: QEMUFile where to send the data
3071  * @opaque: RAMState pointer
3072  */
3073 static int ram_save_iterate(QEMUFile *f, void *opaque)
3074 {
3075     RAMState **temp = opaque;
3076     RAMState *rs = *temp;
3077     int ret;
3078     int i;
3079     int64_t t0;
3080     int done = 0;
3081 
3082     if (blk_mig_bulk_active()) {
3083         /* Avoid transferring ram during bulk phase of block migration as
3084          * the bulk phase will usually take a long time and transferring
3085          * ram updates during that time is pointless. */
3086         goto out;
3087     }
3088 
3089     rcu_read_lock();
3090     if (ram_list.version != rs->last_version) {
3091         ram_state_reset(rs);
3092     }
3093 
3094     /* Read version before ram_list.blocks */
3095     smp_rmb();
3096 
3097     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3098 
3099     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3100     i = 0;
3101     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3102             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3103         int pages;
3104 
3105         if (qemu_file_get_error(f)) {
3106             break;
3107         }
3108 
3109         pages = ram_find_and_save_block(rs, false);
3110         /* no more pages to send */
3111         if (pages == 0) {
3112             done = 1;
3113             break;
3114         }
3115         rs->iterations++;
3116 
3117         /* we want to check in the 1st loop, just in case it was the 1st time
3118            and we had to sync the dirty bitmap.
3119            qemu_get_clock_ns() is a bit expensive, so we only check every
3120            few iterations
3121         */
3122         if ((i & 63) == 0) {
3123             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3124             if (t1 > MAX_WAIT) {
3125                 trace_ram_save_iterate_big_wait(t1, i);
3126                 break;
3127             }
3128         }
3129         i++;
3130     }
3131     flush_compressed_data(rs);
3132     rcu_read_unlock();
3133 
3134     /*
3135      * Must occur before EOS (or any QEMUFile operation)
3136      * because of RDMA protocol.
3137      */
3138     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3139 
3140     multifd_send_sync_main();
3141 out:
3142     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3143     qemu_fflush(f);
3144     ram_counters.transferred += 8;
3145 
3146     ret = qemu_file_get_error(f);
3147     if (ret < 0) {
3148         return ret;
3149     }
3150 
3151     return done;
3152 }
3153 
3154 /**
3155  * ram_save_complete: function called to send the remaining amount of ram
3156  *
3157  * Returns zero to indicate success
3158  *
3159  * Called with iothread lock
3160  *
3161  * @f: QEMUFile where to send the data
3162  * @opaque: RAMState pointer
3163  */
3164 static int ram_save_complete(QEMUFile *f, void *opaque)
3165 {
3166     RAMState **temp = opaque;
3167     RAMState *rs = *temp;
3168 
3169     rcu_read_lock();
3170 
3171     if (!migration_in_postcopy()) {
3172         migration_bitmap_sync(rs);
3173     }
3174 
3175     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3176 
3177     /* try transferring iterative blocks of memory */
3178 
3179     /* flush all remaining blocks regardless of rate limiting */
3180     while (true) {
3181         int pages;
3182 
3183         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3184         /* no more blocks to send */
3185         if (pages == 0) {
3186             break;
3187         }
3188     }
3189 
3190     flush_compressed_data(rs);
3191     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3192 
3193     rcu_read_unlock();
3194 
3195     multifd_send_sync_main();
3196     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3197     qemu_fflush(f);
3198 
3199     return 0;
3200 }
3201 
3202 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3203                              uint64_t *res_precopy_only,
3204                              uint64_t *res_compatible,
3205                              uint64_t *res_postcopy_only)
3206 {
3207     RAMState **temp = opaque;
3208     RAMState *rs = *temp;
3209     uint64_t remaining_size;
3210 
3211     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3212 
3213     if (!migration_in_postcopy() &&
3214         remaining_size < max_size) {
3215         qemu_mutex_lock_iothread();
3216         rcu_read_lock();
3217         migration_bitmap_sync(rs);
3218         rcu_read_unlock();
3219         qemu_mutex_unlock_iothread();
3220         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3221     }
3222 
3223     if (migrate_postcopy_ram()) {
3224         /* We can do postcopy, and all the data is postcopiable */
3225         *res_compatible += remaining_size;
3226     } else {
3227         *res_precopy_only += remaining_size;
3228     }
3229 }
3230 
3231 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3232 {
3233     unsigned int xh_len;
3234     int xh_flags;
3235     uint8_t *loaded_data;
3236 
3237     /* extract RLE header */
3238     xh_flags = qemu_get_byte(f);
3239     xh_len = qemu_get_be16(f);
3240 
3241     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3242         error_report("Failed to load XBZRLE page - wrong compression!");
3243         return -1;
3244     }
3245 
3246     if (xh_len > TARGET_PAGE_SIZE) {
3247         error_report("Failed to load XBZRLE page - len overflow!");
3248         return -1;
3249     }
3250     loaded_data = XBZRLE.decoded_buf;
3251     /* load data and decode */
3252     /* it can change loaded_data to point to an internal buffer */
3253     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3254 
3255     /* decode RLE */
3256     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3257                              TARGET_PAGE_SIZE) == -1) {
3258         error_report("Failed to load XBZRLE page - decode error!");
3259         return -1;
3260     }
3261 
3262     return 0;
3263 }
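
/*
 * For reference, the XBZRLE payload parsed above is laid out as:
 *   byte:  ENCODING_FLAG_XBZRLE
 *   be16:  length of the encoded data
 *   bytes: the encoded delta, at most TARGET_PAGE_SIZE long
 * and xbzrle_decode_buffer() applies that delta on top of the current
 * contents of @host.
 */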
3264 
3265 /**
3266  * ram_block_from_stream: read a RAMBlock id from the migration stream
3267  *
3268  * Must be called from within an RCU critical section.
3269  *
3270  * Returns a pointer from within the RCU-protected ram_list.
3271  *
3272  * @f: QEMUFile where to read the data from
3273  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3274  */
3275 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3276 {
3277     static RAMBlock *block = NULL;
3278     char id[256];
3279     uint8_t len;
3280 
3281     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3282         if (!block) {
3283             error_report("Ack, bad migration stream!");
3284             return NULL;
3285         }
3286         return block;
3287     }
3288 
3289     len = qemu_get_byte(f);
3290     qemu_get_buffer(f, (uint8_t *)id, len);
3291     id[len] = 0;
3292 
3293     block = qemu_ram_block_by_name(id);
3294     if (!block) {
3295         error_report("Can't find block %s", id);
3296         return NULL;
3297     }
3298 
3299     if (!qemu_ram_is_migratable(block)) {
3300         error_report("block %s should not be migrated !", id);
3301         return NULL;
3302     }
3303 
3304     return block;
3305 }
3306 
3307 static inline void *host_from_ram_block_offset(RAMBlock *block,
3308                                                ram_addr_t offset)
3309 {
3310     if (!offset_in_ramblock(block, offset)) {
3311         return NULL;
3312     }
3313 
3314     return block->host + offset;
3315 }
3316 
3317 /**
3318  * ram_handle_compressed: handle the zero page case
3319  *
3320  * If a page (or a whole RDMA chunk) has been
3321  * determined to be zero, then zap it.
3322  *
3323  * @host: host address for the zero page
3324  * @ch: what the page is filled from.  We only support zero
3325  * @size: size of the zero page
3326  */
3327 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3328 {
3329     if (ch != 0 || !is_zero_range(host, size)) {
3330         memset(host, ch, size);
3331     }
3332 }
3333 
3334 /* return the size after decompression, or negative value on error */
3335 static int
3336 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3337                      const uint8_t *source, size_t source_len)
3338 {
3339     int err;
3340 
3341     err = inflateReset(stream);
3342     if (err != Z_OK) {
3343         return -1;
3344     }
3345 
3346     stream->avail_in = source_len;
3347     stream->next_in = (uint8_t *)source;
3348     stream->avail_out = dest_len;
3349     stream->next_out = dest;
3350 
3351     err = inflate(stream, Z_NO_FLUSH);
3352     if (err != Z_STREAM_END) {
3353         return -1;
3354     }
3355 
3356     return stream->total_out;
3357 }
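
/*
 * This is the receive-side counterpart of qemu_put_compression_data():
 * each compressed page is a zlib stream produced from exactly
 * TARGET_PAGE_SIZE input bytes, so a successful inflate() here is
 * expected to end with Z_STREAM_END and total_out equal to dest_len.
 */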
3358 
3359 static void *do_data_decompress(void *opaque)
3360 {
3361     DecompressParam *param = opaque;
3362     unsigned long pagesize;
3363     uint8_t *des;
3364     int len, ret;
3365 
3366     qemu_mutex_lock(&param->mutex);
3367     while (!param->quit) {
3368         if (param->des) {
3369             des = param->des;
3370             len = param->len;
3371             param->des = 0;
3372             qemu_mutex_unlock(&param->mutex);
3373 
3374             pagesize = TARGET_PAGE_SIZE;
3375 
3376             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3377                                        param->compbuf, len);
3378             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3379                 error_report("decompress data failed");
3380                 qemu_file_set_error(decomp_file, ret);
3381             }
3382 
3383             qemu_mutex_lock(&decomp_done_lock);
3384             param->done = true;
3385             qemu_cond_signal(&decomp_done_cond);
3386             qemu_mutex_unlock(&decomp_done_lock);
3387 
3388             qemu_mutex_lock(&param->mutex);
3389         } else {
3390             qemu_cond_wait(&param->cond, &param->mutex);
3391         }
3392     }
3393     qemu_mutex_unlock(&param->mutex);
3394 
3395     return NULL;
3396 }
3397 
3398 static int wait_for_decompress_done(void)
3399 {
3400     int idx, thread_count;
3401 
3402     if (!migrate_use_compression()) {
3403         return 0;
3404     }
3405 
3406     thread_count = migrate_decompress_threads();
3407     qemu_mutex_lock(&decomp_done_lock);
3408     for (idx = 0; idx < thread_count; idx++) {
3409         while (!decomp_param[idx].done) {
3410             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3411         }
3412     }
3413     qemu_mutex_unlock(&decomp_done_lock);
3414     return qemu_file_get_error(decomp_file);
3415 }
3416 
3417 static void compress_threads_load_cleanup(void)
3418 {
3419     int i, thread_count;
3420 
3421     if (!migrate_use_compression()) {
3422         return;
3423     }
3424     thread_count = migrate_decompress_threads();
3425     for (i = 0; i < thread_count; i++) {
3426         /*
3427          * We use compbuf as an indicator of whether the thread has
3428          * been properly initialized.
3429          */
3430         if (!decomp_param[i].compbuf) {
3431             break;
3432         }
3433 
3434         qemu_mutex_lock(&decomp_param[i].mutex);
3435         decomp_param[i].quit = true;
3436         qemu_cond_signal(&decomp_param[i].cond);
3437         qemu_mutex_unlock(&decomp_param[i].mutex);
3438     }
3439     for (i = 0; i < thread_count; i++) {
3440         if (!decomp_param[i].compbuf) {
3441             break;
3442         }
3443 
3444         qemu_thread_join(decompress_threads + i);
3445         qemu_mutex_destroy(&decomp_param[i].mutex);
3446         qemu_cond_destroy(&decomp_param[i].cond);
3447         inflateEnd(&decomp_param[i].stream);
3448         g_free(decomp_param[i].compbuf);
3449         decomp_param[i].compbuf = NULL;
3450     }
3451     g_free(decompress_threads);
3452     g_free(decomp_param);
3453     decompress_threads = NULL;
3454     decomp_param = NULL;
3455     decomp_file = NULL;
3456 }
3457 
3458 static int compress_threads_load_setup(QEMUFile *f)
3459 {
3460     int i, thread_count;
3461 
3462     if (!migrate_use_compression()) {
3463         return 0;
3464     }
3465 
3466     thread_count = migrate_decompress_threads();
3467     decompress_threads = g_new0(QemuThread, thread_count);
3468     decomp_param = g_new0(DecompressParam, thread_count);
3469     qemu_mutex_init(&decomp_done_lock);
3470     qemu_cond_init(&decomp_done_cond);
3471     decomp_file = f;
3472     for (i = 0; i < thread_count; i++) {
3473         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3474             goto exit;
3475         }
3476 
3477         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3478         qemu_mutex_init(&decomp_param[i].mutex);
3479         qemu_cond_init(&decomp_param[i].cond);
3480         decomp_param[i].done = true;
3481         decomp_param[i].quit = false;
3482         qemu_thread_create(decompress_threads + i, "decompress",
3483                            do_data_decompress, decomp_param + i,
3484                            QEMU_THREAD_JOINABLE);
3485     }
3486     return 0;
3487 exit:
3488     compress_threads_load_cleanup();
3489     return -1;
3490 }
3491 
3492 static void decompress_data_with_multi_threads(QEMUFile *f,
3493                                                void *host, int len)
3494 {
3495     int idx, thread_count;
3496 
3497     thread_count = migrate_decompress_threads();
3498     qemu_mutex_lock(&decomp_done_lock);
3499     while (true) {
3500         for (idx = 0; idx < thread_count; idx++) {
3501             if (decomp_param[idx].done) {
3502                 decomp_param[idx].done = false;
3503                 qemu_mutex_lock(&decomp_param[idx].mutex);
3504                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3505                 decomp_param[idx].des = host;
3506                 decomp_param[idx].len = len;
3507                 qemu_cond_signal(&decomp_param[idx].cond);
3508                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3509                 break;
3510             }
3511         }
3512         if (idx < thread_count) {
3513             break;
3514         } else {
3515             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3516         }
3517     }
3518     qemu_mutex_unlock(&decomp_done_lock);
3519 }
3520 
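/*
 * Hand-off protocol between decompress_data_with_multi_threads() above
 * (the producer, called from ram_load()) and the do_data_decompress()
 * worker threads (the consumers):
 *
 *   producer: find a param with done == true, clear done, read the
 *             compressed data into compbuf, set des/len, signal cond;
 *   consumer: inflate compbuf into des, then take decomp_done_lock,
 *             set done = true and signal decomp_done_cond;
 *   producer: if no worker is free, wait on decomp_done_cond.
 *
 * wait_for_decompress_done() drains the pool through the same done
 * flags before ram_load() returns.
 */
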
3521 /**
3522  * ram_load_setup: Setup RAM for migration incoming side
3523  *
3524  * Returns zero to indicate success and a negative value on error
3525  *
3526  * @f: QEMUFile where to receive the data
3527  * @opaque: RAMState pointer
3528  */
3529 static int ram_load_setup(QEMUFile *f, void *opaque)
3530 {
3531     if (compress_threads_load_setup(f)) {
3532         return -1;
3533     }
3534 
3535     xbzrle_load_setup();
3536     ramblock_recv_map_init();
3537     return 0;
3538 }
3539 
3540 static int ram_load_cleanup(void *opaque)
3541 {
3542     RAMBlock *rb;
3543     xbzrle_load_cleanup();
3544     compress_threads_load_cleanup();
3545 
3546     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3547         g_free(rb->receivedmap);
3548         rb->receivedmap = NULL;
3549     }
3550     return 0;
3551 }
3552 
3553 /**
3554  * ram_postcopy_incoming_init: allocate postcopy data structures
3555  *
3556  * Returns 0 for success and a negative value on error
3557  *
3558  * @mis: current migration incoming state
3559  *
3560  * Allocate data structures etc needed by incoming migration with
3561  * postcopy-ram. postcopy-ram's similarly named
3562  * postcopy_ram_incoming_init does the work.
3563  */
3564 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3565 {
3566     return postcopy_ram_incoming_init(mis);
3567 }
3568 
3569 /**
3570  * ram_load_postcopy: load a page in postcopy case
3571  *
3572  * Returns 0 for success or -errno in case of error
3573  *
3574  * Called in postcopy mode by ram_load().
3575  * rcu_read_lock is taken prior to this being called.
3576  *
3577  * @f: QEMUFile to receive the data from
3578  */
3579 static int ram_load_postcopy(QEMUFile *f)
3580 {
3581     int flags = 0, ret = 0;
3582     bool place_needed = false;
3583     bool matches_target_page_size = false;
3584     MigrationIncomingState *mis = migration_incoming_get_current();
3585     /* Temporary page that is later 'placed' */
3586     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3587     void *last_host = NULL;
3588     bool all_zero = false;
3589 
3590     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3591         ram_addr_t addr;
3592         void *host = NULL;
3593         void *page_buffer = NULL;
3594         void *place_source = NULL;
3595         RAMBlock *block = NULL;
3596         uint8_t ch;
3597 
3598         addr = qemu_get_be64(f);
3599 
3600         /*
3601          * If there is a QEMUFile error, we should stop here; in that
3602          * case "addr" may be invalid.
3603          */
3604         ret = qemu_file_get_error(f);
3605         if (ret) {
3606             break;
3607         }
3608 
3609         flags = addr & ~TARGET_PAGE_MASK;
3610         addr &= TARGET_PAGE_MASK;
3611 
3612         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3613         place_needed = false;
3614         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3615             block = ram_block_from_stream(f, flags);
3616 
3617             host = host_from_ram_block_offset(block, addr);
3618             if (!host) {
3619                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3620                 ret = -EINVAL;
3621                 break;
3622             }
3623             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3624             /*
3625              * Postcopy requires that we place whole host pages atomically;
3626              * these may be huge pages for RAMBlocks that are backed by
3627              * hugetlbfs.
3628              * To make it atomic, the data is read into a temporary page
3629              * that's moved into place later.
3630              * The migration protocol uses (possibly smaller) target pages;
3631              * however, the source ensures it always sends all the components
3632              * of a host page in order.
3633              */
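            /*
             * For example (illustrative numbers): with a 2 MiB
             * hugetlbfs-backed block and 4 KiB target pages, 512
             * consecutive target pages are accumulated in
             * postcopy_host_page before one atomic placement.
             */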
3634             page_buffer = postcopy_host_page +
3635                           ((uintptr_t)host & (block->page_size - 1));
3636             /* 1st TP of the HP: if all TPs are zero we can optimise the place */
3637             if (!((uintptr_t)host & (block->page_size - 1))) {
3638                 all_zero = true;
3639             } else {
3640                 /* not the 1st TP within the HP */
3641                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3642                     error_report("Non-sequential target page %p/%p",
3643                                   host, last_host);
3644                     ret = -EINVAL;
3645                     break;
3646                 }
3647             }
3648 
3649 
3650             /*
3651              * If it's the last part of a host page then we place the host
3652              * page
3653              */
3654             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3655                                      (block->page_size - 1)) == 0;
3656             place_source = postcopy_host_page;
3657         }
3658         last_host = host;
3659 
3660         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3661         case RAM_SAVE_FLAG_ZERO:
3662             ch = qemu_get_byte(f);
3663             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3664             if (ch) {
3665                 all_zero = false;
3666             }
3667             break;
3668 
3669         case RAM_SAVE_FLAG_PAGE:
3670             all_zero = false;
3671             if (!matches_target_page_size) {
3672                 /* For huge pages, we always use the temporary buffer */
3673                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3674             } else {
3675                 /*
3676                  * For small pages that match the target page size, we
3677                  * avoid the qemu_file copy.  Instead we directly use
3678                  * the buffer of QEMUFile to place the page.  Note: we
3679                  * cannot do any QEMUFile operation before using that
3680                  * buffer to make sure the buffer is valid when
3681                  * placing the page.
3682                  */
3683                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3684                                          TARGET_PAGE_SIZE);
3685             }
3686             break;
3687         case RAM_SAVE_FLAG_EOS:
3688             /* normal exit */
3689             multifd_recv_sync_main();
3690             break;
3691         default:
3692             error_report("Unknown combination of migration flags: %#x"
3693                          " (postcopy mode)", flags);
3694             ret = -EINVAL;
3695             break;
3696         }
3697 
3698         /* Detect any possible file errors */
3699         if (!ret && qemu_file_get_error(f)) {
3700             ret = qemu_file_get_error(f);
3701         }
3702 
3703         if (!ret && place_needed) {
3704             /* This gets called at the last target page in the host page */
3705             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3706 
3707             if (all_zero) {
3708                 ret = postcopy_place_page_zero(mis, place_dest,
3709                                                block);
3710             } else {
3711                 ret = postcopy_place_page(mis, place_dest,
3712                                           place_source, block);
3713             }
3714         }
3715     }
3716 
3717     return ret;
3718 }
3719 
3720 static bool postcopy_is_advised(void)
3721 {
3722     PostcopyState ps = postcopy_state_get();
3723     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3724 }
3725 
3726 static bool postcopy_is_running(void)
3727 {
3728     PostcopyState ps = postcopy_state_get();
3729     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3730 }
3731 
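/*
 * Layout of each record in the precopy stream, as parsed by ram_load()
 * below: a be64 word whose low bits (below TARGET_PAGE_MASK) are
 * RAM_SAVE_FLAG_* flags and whose upper bits are the page offset (or
 * the total RAM size for MEM_SIZE).  For the page-carrying flags a
 * "len byte + block id" pair follows unless RAM_SAVE_FLAG_CONTINUE is
 * set.  The payload then depends on the flag: one fill byte for ZERO,
 * a raw TARGET_PAGE_SIZE buffer for PAGE, a be32 length plus compressed
 * data for COMPRESS_PAGE, and the encoded data for XBZRLE.  A MEM_SIZE
 * record instead carries the ramblock list (id, length and, for
 * postcopy with huge pages, the page size), and EOS ends the section.
 */
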
3732 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3733 {
3734     int flags = 0, ret = 0, invalid_flags = 0;
3735     static uint64_t seq_iter;
3736     int len = 0;
3737     /*
3738      * If the system is running in postcopy mode, page inserts into
3739      * host memory must be atomic.
3740      */
3741     bool postcopy_running = postcopy_is_running();
3742     /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3743     bool postcopy_advised = postcopy_is_advised();
3744 
3745     seq_iter++;
3746 
3747     if (version_id != 4) {
3748         ret = -EINVAL;
3749     }
3750 
3751     if (!migrate_use_compression()) {
3752         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3753     }
3754     /* This RCU critical section can be very long running.
3755      * When RCU reclamations in the code become numerous,
3756      * it will be necessary to reduce the granularity of this
3757      * critical section.
3758      */
3759     rcu_read_lock();
3760 
3761     if (postcopy_running) {
3762         ret = ram_load_postcopy(f);
3763     }
3764 
3765     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3766         ram_addr_t addr, total_ram_bytes;
3767         void *host = NULL;
3768         uint8_t ch;
3769 
3770         addr = qemu_get_be64(f);
3771         flags = addr & ~TARGET_PAGE_MASK;
3772         addr &= TARGET_PAGE_MASK;
3773 
3774         if (flags & invalid_flags) {
3775             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3776                 error_report("Received an unexpected compressed page");
3777             }
3778 
3779             ret = -EINVAL;
3780             break;
3781         }
3782 
3783         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3784                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3785             RAMBlock *block = ram_block_from_stream(f, flags);
3786 
3787             host = host_from_ram_block_offset(block, addr);
3788             if (!host) {
3789                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3790                 ret = -EINVAL;
3791                 break;
3792             }
3793             ramblock_recv_bitmap_set(block, host);
3794             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3795         }
3796 
3797         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3798         case RAM_SAVE_FLAG_MEM_SIZE:
3799             /* Synchronize RAM block list */
3800             total_ram_bytes = addr;
3801             while (!ret && total_ram_bytes) {
3802                 RAMBlock *block;
3803                 char id[256];
3804                 ram_addr_t length;
3805 
3806                 len = qemu_get_byte(f);
3807                 qemu_get_buffer(f, (uint8_t *)id, len);
3808                 id[len] = 0;
3809                 length = qemu_get_be64(f);
3810 
3811                 block = qemu_ram_block_by_name(id);
3812                 if (block && !qemu_ram_is_migratable(block)) {
3813                     error_report("block %s should not be migrated !", id);
3814                     ret = -EINVAL;
3815                 } else if (block) {
3816                     if (length != block->used_length) {
3817                         Error *local_err = NULL;
3818 
3819                         ret = qemu_ram_resize(block, length,
3820                                               &local_err);
3821                         if (local_err) {
3822                             error_report_err(local_err);
3823                         }
3824                     }
3825                     /* For postcopy we need to check that hugepage sizes match */
3826                     if (postcopy_advised &&
3827                         block->page_size != qemu_host_page_size) {
3828                         uint64_t remote_page_size = qemu_get_be64(f);
3829                         if (remote_page_size != block->page_size) {
3830                             error_report("Mismatched RAM page size %s "
3831                                          "(local) %zd != %" PRId64,
3832                                          id, block->page_size,
3833                                          remote_page_size);
3834                             ret = -EINVAL;
3835                         }
3836                     }
3837                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3838                                           block->idstr);
3839                 } else {
3840                     error_report("Unknown ramblock \"%s\", cannot "
3841                                  "accept migration", id);
3842                     ret = -EINVAL;
3843                 }
3844 
3845                 total_ram_bytes -= length;
3846             }
3847             break;
3848 
3849         case RAM_SAVE_FLAG_ZERO:
3850             ch = qemu_get_byte(f);
3851             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3852             break;
3853 
3854         case RAM_SAVE_FLAG_PAGE:
3855             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3856             break;
3857 
3858         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3859             len = qemu_get_be32(f);
3860             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3861                 error_report("Invalid compressed data length: %d", len);
3862                 ret = -EINVAL;
3863                 break;
3864             }
3865             decompress_data_with_multi_threads(f, host, len);
3866             break;
3867 
3868         case RAM_SAVE_FLAG_XBZRLE:
3869             if (load_xbzrle(f, addr, host) < 0) {
3870                 error_report("Failed to decompress XBZRLE page at "
3871                              RAM_ADDR_FMT, addr);
3872                 ret = -EINVAL;
3873                 break;
3874             }
3875             break;
3876         case RAM_SAVE_FLAG_EOS:
3877             /* normal exit */
3878             multifd_recv_sync_main();
3879             break;
3880         default:
3881             if (flags & RAM_SAVE_FLAG_HOOK) {
3882                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3883             } else {
3884                 error_report("Unknown combination of migration flags: %#x",
3885                              flags);
3886                 ret = -EINVAL;
3887             }
3888         }
3889         if (!ret) {
3890             ret = qemu_file_get_error(f);
3891         }
3892     }
3893 
3894     ret |= wait_for_decompress_done();
3895     rcu_read_unlock();
3896     trace_ram_load_complete(ret, seq_iter);
3897     return ret;
3898 }
3899 
3900 static bool ram_has_postcopy(void *opaque)
3901 {
3902     return migrate_postcopy_ram();
3903 }
3904 
3905 /* Sync all the dirty bitmaps with the destination VM. */
3906 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3907 {
3908     RAMBlock *block;
3909     QEMUFile *file = s->to_dst_file;
3910     int ramblock_count = 0;
3911 
3912     trace_ram_dirty_bitmap_sync_start();
3913 
3914     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3915         qemu_savevm_send_recv_bitmap(file, block->idstr);
3916         trace_ram_dirty_bitmap_request(block->idstr);
3917         ramblock_count++;
3918     }
3919 
3920     trace_ram_dirty_bitmap_sync_wait();
3921 
3922     /* Wait until all the ramblocks' dirty bitmaps are synced */
3923     while (ramblock_count--) {
3924         qemu_sem_wait(&s->rp_state.rp_sem);
3925     }
3926 
3927     trace_ram_dirty_bitmap_sync_complete();
3928 
3929     return 0;
3930 }
3931 
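/*
 * Counterpart of the wait above: each time the return path finishes
 * reloading one ramblock's bitmap (ram_dirty_bitmap_reload() below),
 * it posts rp_sem once, so ram_dirty_bitmap_sync_all() wakes up exactly
 * ramblock_count times.
 */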
3932 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3933 {
3934     qemu_sem_post(&s->rp_state.rp_sem);
3935 }
3936 
3937 /*
3938  * Read the received bitmap and complement it into the initial dirty bitmap.
3939  * This is only used when a postcopy migration has been paused and wants
3940  * to resume from a middle point.
3941  */
3942 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3943 {
3944     int ret = -EINVAL;
3945     QEMUFile *file = s->rp_state.from_dst_file;
3946     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3947     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3948     uint64_t size, end_mark;
3949 
3950     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3951 
3952     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3953         error_report("%s: incorrect state %s", __func__,
3954                      MigrationStatus_str(s->state));
3955         return -EINVAL;
3956     }
3957 
3958     /*
3959      * Note: see comments in ramblock_recv_bitmap_send() on why we
3960  * need the endianness conversion and the padding.
3961      */
3962     local_size = ROUND_UP(local_size, 8);
3963 
3964     /* Add padding */
3965     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3966 
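    /*
     * Worked example (illustrative numbers): a 1 GiB ramblock with 4 KiB
     * target pages has nbits = 262144, so local_size is 32768 bytes and
     * already a multiple of 8; le_bitmap is allocated with one spare long
     * so the 8-byte-rounded read and the long-granularity conversion
     * below stay within bounds.
     */
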
3967     size = qemu_get_be64(file);
3968 
3969     /* The size of the bitmap should match that of our ramblock */
3970     if (size != local_size) {
3971         error_report("%s: ramblock '%s' bitmap size mismatch "
3972                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3973                      block->idstr, size, local_size);
3974         ret = -EINVAL;
3975         goto out;
3976     }
3977 
3978     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3979     end_mark = qemu_get_be64(file);
3980 
3981     ret = qemu_file_get_error(file);
3982     if (ret || size != local_size) {
3983         error_report("%s: read bitmap failed for ramblock '%s': %d"
3984                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3985                      __func__, block->idstr, ret, local_size, size);
3986         ret = -EIO;
3987         goto out;
3988     }
3989 
3990     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3991         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3992                      __func__, block->idstr, end_mark);
3993         ret = -EINVAL;
3994         goto out;
3995     }
3996 
3997     /*
3998      * Endianness conversion.  We are in postcopy (though paused).
3999      * The dirty bitmap won't change. We can directly modify it.
4000      */
4001     bitmap_from_le(block->bmap, le_bitmap, nbits);
4002 
4003     /*
4004      * What we received is the "received bitmap".  Complement it to form
4005      * the initial dirty bitmap for this ramblock.
4006      */
4007     bitmap_complement(block->bmap, block->bmap, nbits);
4008 
4009     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4010 
4011     /*
4012      * We succeeded in syncing the bitmap for the current ramblock. If this is
4013      * the last one to sync, we need to notify the main send thread.
4014      */
4015     ram_dirty_bitmap_reload_notify(s);
4016 
4017     ret = 0;
4018 out:
4019     g_free(le_bitmap);
4020     return ret;
4021 }
4022 
4023 static int ram_resume_prepare(MigrationState *s, void *opaque)
4024 {
4025     RAMState *rs = *(RAMState **)opaque;
4026     int ret;
4027 
4028     ret = ram_dirty_bitmap_sync_all(s, rs);
4029     if (ret) {
4030         return ret;
4031     }
4032 
4033     ram_state_resume_prepare(rs, s->to_dst_file);
4034 
4035     return 0;
4036 }
4037 
4038 static SaveVMHandlers savevm_ram_handlers = {
4039     .save_setup = ram_save_setup,
4040     .save_live_iterate = ram_save_iterate,
4041     .save_live_complete_postcopy = ram_save_complete,
4042     .save_live_complete_precopy = ram_save_complete,
4043     .has_postcopy = ram_has_postcopy,
4044     .save_live_pending = ram_save_pending,
4045     .load_state = ram_load,
4046     .save_cleanup = ram_save_cleanup,
4047     .load_setup = ram_load_setup,
4048     .load_cleanup = ram_load_cleanup,
4049     .resume_prepare = ram_resume_prepare,
4050 };
4051 
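/*
 * The "ram" section is registered as a live section with version 4,
 * which is the version_id that ram_load() checks on the incoming side.
 */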
4052 void ram_mig_init(void)
4053 {
4054     qemu_mutex_init(&XBZRLE.lock);
4055     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4056 }
4057