xref: /openbmc/qemu/migration/ram.c (revision 56411125)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include <stdint.h>
29 #include <zlib.h>
30 #include "qemu/bitops.h"
31 #include "qemu/bitmap.h"
32 #include "qemu/timer.h"
33 #include "qemu/main-loop.h"
34 #include "migration/migration.h"
35 #include "exec/address-spaces.h"
36 #include "migration/page_cache.h"
37 #include "qemu/error-report.h"
38 #include "trace.h"
39 #include "exec/ram_addr.h"
40 #include "qemu/rcu_queue.h"
41 
42 #ifdef DEBUG_MIGRATION_RAM
43 #define DPRINTF(fmt, ...) \
44     do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
45 #else
46 #define DPRINTF(fmt, ...) \
47     do { } while (0)
48 #endif
49 
50 static int dirty_rate_high_cnt;
51 
52 static uint64_t bitmap_sync_count;
53 
54 /***********************************************************/
55 /* ram save/restore */
56 
57 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
58 #define RAM_SAVE_FLAG_COMPRESS 0x02
59 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
60 #define RAM_SAVE_FLAG_PAGE     0x08
61 #define RAM_SAVE_FLAG_EOS      0x10
62 #define RAM_SAVE_FLAG_CONTINUE 0x20
63 #define RAM_SAVE_FLAG_XBZRLE   0x40
64 /* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
65 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
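/* These flags are carried in the low bits of the page-aligned offset written
 * by save_page_header(); ram_load() recovers them with
 * "flags = addr & ~TARGET_PAGE_MASK" and "addr &= TARGET_PAGE_MASK".
 */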
66 
67 static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
68 
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
70 {
71     return buffer_find_nonzero_offset(p, size) == size;
72 }
73 
74 /* This struct contains the XBZRLE cache and a static page
75    used by the compression */
76 static struct {
77     /* buffer used for XBZRLE encoding */
78     uint8_t *encoded_buf;
79     /* buffer for storing page content */
80     uint8_t *current_buf;
81     /* Cache for XBZRLE, Protected by lock. */
82     PageCache *cache;
83     QemuMutex lock;
84 } XBZRLE;
85 
86 /* buffer used for XBZRLE decoding */
87 static uint8_t *xbzrle_decoded_buf;
88 
89 static void XBZRLE_cache_lock(void)
90 {
91     if (migrate_use_xbzrle())
92         qemu_mutex_lock(&XBZRLE.lock);
93 }
94 
95 static void XBZRLE_cache_unlock(void)
96 {
97     if (migrate_use_xbzrle())
98         qemu_mutex_unlock(&XBZRLE.lock);
99 }
100 
101 /*
102  * Called from qmp_migrate_set_cache_size in the main thread, possibly while
103  * a migration is in progress.
104  * A running migration may be using the cache and might finish during this
105  * call, hence changes to the cache are protected by XBZRLE.lock.
106  */
107 int64_t xbzrle_cache_resize(int64_t new_size)
108 {
109     PageCache *new_cache;
110     int64_t ret;
111 
112     if (new_size < TARGET_PAGE_SIZE) {
113         return -1;
114     }
115 
116     XBZRLE_cache_lock();
117 
118     if (XBZRLE.cache != NULL) {
119         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120             goto out_new_size;
121         }
122         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123                                         TARGET_PAGE_SIZE);
124         if (!new_cache) {
125             error_report("Error creating cache");
126             ret = -1;
127             goto out;
128         }
129 
130         cache_fini(XBZRLE.cache);
131         XBZRLE.cache = new_cache;
132     }
133 
134 out_new_size:
135     ret = pow2floor(new_size);
136 out:
137     XBZRLE_cache_unlock();
138     return ret;
139 }
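
/* A minimal usage sketch (the real caller is qmp_migrate_set_cache_size(), as
 * noted above); this is an illustration, not code from this file:
 *
 *     int64_t actual = xbzrle_cache_resize(requested_bytes);
 *     if (actual < 0) {
 *         // request below TARGET_PAGE_SIZE, or cache_init() failed
 *     } else {
 *         // the size in effect is pow2floor(requested_bytes), returned here
 *     }
 *
 * Callers should treat the return value, not their request, as authoritative.
 */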
140 
141 /* accounting for migration statistics */
142 typedef struct AccountingInfo {
143     uint64_t dup_pages;
144     uint64_t skipped_pages;
145     uint64_t norm_pages;
146     uint64_t iterations;
147     uint64_t xbzrle_bytes;
148     uint64_t xbzrle_pages;
149     uint64_t xbzrle_cache_miss;
150     double xbzrle_cache_miss_rate;
151     uint64_t xbzrle_overflows;
152 } AccountingInfo;
153 
154 static AccountingInfo acct_info;
155 
156 static void acct_clear(void)
157 {
158     memset(&acct_info, 0, sizeof(acct_info));
159 }
160 
161 uint64_t dup_mig_bytes_transferred(void)
162 {
163     return acct_info.dup_pages * TARGET_PAGE_SIZE;
164 }
165 
166 uint64_t dup_mig_pages_transferred(void)
167 {
168     return acct_info.dup_pages;
169 }
170 
171 uint64_t skipped_mig_bytes_transferred(void)
172 {
173     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
174 }
175 
176 uint64_t skipped_mig_pages_transferred(void)
177 {
178     return acct_info.skipped_pages;
179 }
180 
181 uint64_t norm_mig_bytes_transferred(void)
182 {
183     return acct_info.norm_pages * TARGET_PAGE_SIZE;
184 }
185 
186 uint64_t norm_mig_pages_transferred(void)
187 {
188     return acct_info.norm_pages;
189 }
190 
191 uint64_t xbzrle_mig_bytes_transferred(void)
192 {
193     return acct_info.xbzrle_bytes;
194 }
195 
196 uint64_t xbzrle_mig_pages_transferred(void)
197 {
198     return acct_info.xbzrle_pages;
199 }
200 
201 uint64_t xbzrle_mig_pages_cache_miss(void)
202 {
203     return acct_info.xbzrle_cache_miss;
204 }
205 
206 double xbzrle_mig_cache_miss_rate(void)
207 {
208     return acct_info.xbzrle_cache_miss_rate;
209 }
210 
211 uint64_t xbzrle_mig_pages_overflow(void)
212 {
213     return acct_info.xbzrle_overflows;
214 }
215 
216 /* This is the last block that we have visited searching for dirty pages
217  */
218 static RAMBlock *last_seen_block;
219 /* This is the last block from where we have sent data */
220 static RAMBlock *last_sent_block;
221 static ram_addr_t last_offset;
222 static QemuMutex migration_bitmap_mutex;
223 static uint64_t migration_dirty_pages;
224 static uint32_t last_version;
225 static bool ram_bulk_stage;
226 
227 /* used by the search for pages to send */
228 struct PageSearchStatus {
229     /* Current block being searched */
230     RAMBlock    *block;
231     /* Current offset to search from */
232     ram_addr_t   offset;
233     /* Set once we wrap around */
234     bool         complete_round;
235 };
236 typedef struct PageSearchStatus PageSearchStatus;
237 
238 static struct BitmapRcu {
239     struct rcu_head rcu;
240     unsigned long *bmap;
241 } *migration_bitmap_rcu;
242 
243 struct CompressParam {
244     bool start;
245     bool done;
246     QEMUFile *file;
247     QemuMutex mutex;
248     QemuCond cond;
249     RAMBlock *block;
250     ram_addr_t offset;
251 };
252 typedef struct CompressParam CompressParam;
253 
254 struct DecompressParam {
255     bool start;
256     QemuMutex mutex;
257     QemuCond cond;
258     void *des;
259     uint8_t *compbuf;
260     int len;
261 };
262 typedef struct DecompressParam DecompressParam;
263 
264 static CompressParam *comp_param;
265 static QemuThread *compress_threads;
266 /* comp_done_cond is used to wake up the migration thread when
267  * one of the compression threads has finished the compression.
268  * comp_done_lock is used together with comp_done_cond.
269  */
270 static QemuMutex *comp_done_lock;
271 static QemuCond *comp_done_cond;
272 /* The empty QEMUFileOps will be used by file in CompressParam */
273 static const QEMUFileOps empty_ops = { };
274 
275 static bool compression_switch;
276 static bool quit_comp_thread;
277 static bool quit_decomp_thread;
278 static DecompressParam *decomp_param;
279 static QemuThread *decompress_threads;
280 static uint8_t *compressed_data_buf;
281 
282 static int do_compress_ram_page(CompressParam *param);
283 
284 static void *do_data_compress(void *opaque)
285 {
286     CompressParam *param = opaque;
287 
288     while (!quit_comp_thread) {
289         qemu_mutex_lock(&param->mutex);
290         /* Re-check quit_comp_thread in case
291          * terminate_compression_threads() is called just before
292          * qemu_mutex_lock(&param->mutex) and after
293          * while (!quit_comp_thread); re-checking it here makes
294          * sure the compression thread terminates as expected.
295          */
296         while (!param->start && !quit_comp_thread) {
297             qemu_cond_wait(&param->cond, &param->mutex);
298         }
299         if (!quit_comp_thread) {
300             do_compress_ram_page(param);
301         }
302         param->start = false;
303         qemu_mutex_unlock(&param->mutex);
304 
305         qemu_mutex_lock(comp_done_lock);
306         param->done = true;
307         qemu_cond_signal(comp_done_cond);
308         qemu_mutex_unlock(comp_done_lock);
309     }
310 
311     return NULL;
312 }
313 
314 static inline void terminate_compression_threads(void)
315 {
316     int idx, thread_count;
317 
318     thread_count = migrate_compress_threads();
319     quit_comp_thread = true;
320     for (idx = 0; idx < thread_count; idx++) {
321         qemu_mutex_lock(&comp_param[idx].mutex);
322         qemu_cond_signal(&comp_param[idx].cond);
323         qemu_mutex_unlock(&comp_param[idx].mutex);
324     }
325 }
326 
327 void migrate_compress_threads_join(void)
328 {
329     int i, thread_count;
330 
331     if (!migrate_use_compression()) {
332         return;
333     }
334     terminate_compression_threads();
335     thread_count = migrate_compress_threads();
336     for (i = 0; i < thread_count; i++) {
337         qemu_thread_join(compress_threads + i);
338         qemu_fclose(comp_param[i].file);
339         qemu_mutex_destroy(&comp_param[i].mutex);
340         qemu_cond_destroy(&comp_param[i].cond);
341     }
342     qemu_mutex_destroy(comp_done_lock);
343     qemu_cond_destroy(comp_done_cond);
344     g_free(compress_threads);
345     g_free(comp_param);
346     g_free(comp_done_cond);
347     g_free(comp_done_lock);
348     compress_threads = NULL;
349     comp_param = NULL;
350     comp_done_cond = NULL;
351     comp_done_lock = NULL;
352 }
353 
354 void migrate_compress_threads_create(void)
355 {
356     int i, thread_count;
357 
358     if (!migrate_use_compression()) {
359         return;
360     }
361     quit_comp_thread = false;
362     compression_switch = true;
363     thread_count = migrate_compress_threads();
364     compress_threads = g_new0(QemuThread, thread_count);
365     comp_param = g_new0(CompressParam, thread_count);
366     comp_done_cond = g_new0(QemuCond, 1);
367     comp_done_lock = g_new0(QemuMutex, 1);
368     qemu_cond_init(comp_done_cond);
369     qemu_mutex_init(comp_done_lock);
370     for (i = 0; i < thread_count; i++) {
371         /* comp_param[i].file is just used as a dummy buffer to save data; set
372          * its ops to empty.
373          */
374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375         comp_param[i].done = true;
376         qemu_mutex_init(&comp_param[i].mutex);
377         qemu_cond_init(&comp_param[i].cond);
378         qemu_thread_create(compress_threads + i, "compress",
379                            do_data_compress, comp_param + i,
380                            QEMU_THREAD_JOINABLE);
381     }
382 }
383 
384 /**
385  * save_page_header: Write page header to wire
386  *
387  * If this is the 1st block, it also writes the block identification
388  *
389  * Returns: Number of bytes written
390  *
391  * @f: QEMUFile where to send the data
392  * @block: block that contains the page we want to send
393  * @offset: offset inside the block for the page
394  *          in the lower bits, it contains flags
395  */
396 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
397 {
398     size_t size, len;
399 
400     qemu_put_be64(f, offset);
401     size = 8;
402 
403     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
404         len = strlen(block->idstr);
405         qemu_put_byte(f, len);
406         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
407         size += 1 + len;
408     }
409     return size;
410 }
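
/* Informally, the header written above looks like this on the wire (a sketch
 * inferred from save_page_header() and the matching reads in ram_load() and
 * host_from_stream_offset(), not a separate format description):
 *
 *     be64   offset | RAM_SAVE_FLAG_*        flags live in the low bits
 *     if !(offset & RAM_SAVE_FLAG_CONTINUE):
 *         u8     len = strlen(block->idstr)
 *         bytes  block->idstr[0..len)        not NUL terminated
 *
 * Subsequent pages from the same block set RAM_SAVE_FLAG_CONTINUE and skip
 * the idstr, which is why the block name is only resent when switching blocks.
 */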
411 
412 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
413  * If guest dirty memory rate is reduced below the rate at which we can
414  * transfer pages to the destination then we should be able to complete
415  * migration. Some workloads dirty memory way too fast and will not effectively
416  * converge, even with auto-converge.
417  */
418 static void mig_throttle_guest_down(void)
419 {
420     MigrationState *s = migrate_get_current();
421     uint64_t pct_initial =
422             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
423     uint64_t pct_increment =
424             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
425 
426     /* We have not started throttling yet. Let's start it. */
427     if (!cpu_throttle_active()) {
428         cpu_throttle_set(pct_initial);
429     } else {
430         /* Throttling already on, just increase the rate */
431         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
432     }
433 }
434 
435 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
436  * The important thing is that a stale (not-yet-0'd) page be replaced
437  * by the new data.
438  * As a bonus, if the page wasn't in the cache it gets added so that
439  * when a small write is made into the 0'd page it gets sent via XBZRLE.
440  */
441 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
442 {
443     if (ram_bulk_stage || !migrate_use_xbzrle()) {
444         return;
445     }
446 
447     /* We don't care if this fails to allocate a new cache page
448      * as long as it updated an old one */
449     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
450                  bitmap_sync_count);
451 }
452 
453 #define ENCODING_FLAG_XBZRLE 0x1
454 
455 /**
456  * save_xbzrle_page: compress and send current page
457  *
458  * Returns: 1 means that we wrote the page
459  *          0 means that page is identical to the one already sent
460  *          -1 means that xbzrle would be longer than normal
461  *
462  * @f: QEMUFile where to send the data
463  * @current_data: pointer to the page's data; may be updated to point at the cached copy
464  * @current_addr: RAM address of the page
465  * @block: block that contains the page we want to send
466  * @offset: offset inside the block for the page
467  * @last_stage: if we are at the completion stage
468  * @bytes_transferred: increase it with the number of transferred bytes
469  */
470 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
471                             ram_addr_t current_addr, RAMBlock *block,
472                             ram_addr_t offset, bool last_stage,
473                             uint64_t *bytes_transferred)
474 {
475     int encoded_len = 0, bytes_xbzrle;
476     uint8_t *prev_cached_page;
477 
478     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
479         acct_info.xbzrle_cache_miss++;
480         if (!last_stage) {
481             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
482                              bitmap_sync_count) == -1) {
483                 return -1;
484             } else {
485                 /* update *current_data when the page has been
486                    inserted into cache */
487                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
488             }
489         }
490         return -1;
491     }
492 
493     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
494 
495     /* save current buffer into memory */
496     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
497 
498     /* XBZRLE encoding (if there is no overflow) */
499     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
500                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
501                                        TARGET_PAGE_SIZE);
502     if (encoded_len == 0) {
503         DPRINTF("Skipping unmodified page\n");
504         return 0;
505     } else if (encoded_len == -1) {
506         DPRINTF("Overflow\n");
507         acct_info.xbzrle_overflows++;
508         /* update data in the cache */
509         if (!last_stage) {
510             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
511             *current_data = prev_cached_page;
512         }
513         return -1;
514     }
515 
516     /* Update the data in the cache so it matches the page the destination will reconstruct */
517     if (!last_stage) {
518         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
519     }
520 
521     /* Send XBZRLE based compressed page */
522     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
523     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
524     qemu_put_be16(f, encoded_len);
525     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
526     bytes_xbzrle += encoded_len + 1 + 2;
527     acct_info.xbzrle_pages++;
528     acct_info.xbzrle_bytes += bytes_xbzrle;
529     *bytes_transferred += bytes_xbzrle;
530 
531     return 1;
532 }
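
/* The XBZRLE record produced above is, informally (it matches the reader in
 * load_xbzrle()):
 *
 *     <page header with RAM_SAVE_FLAG_XBZRLE set>
 *     u8    ENCODING_FLAG_XBZRLE
 *     be16  encoded_len
 *     bytes encoded_buf[0..encoded_len)
 *
 * hence the "encoded_len + 1 + 2" added to bytes_xbzrle above.
 */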
533 
534 /* Called with rcu_read_lock() to protect migration_bitmap */
535 static inline
536 ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
537                                                  ram_addr_t start)
538 {
539     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
540     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
541     uint64_t rb_size = rb->used_length;
542     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
543     unsigned long *bitmap;
544 
545     unsigned long next;
546 
547     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
548     if (ram_bulk_stage && nr > base) {
549         next = nr + 1;
550     } else {
551         next = find_next_bit(bitmap, size, nr);
552     }
553 
554     if (next < size) {
555         clear_bit(next, bitmap);
556         migration_dirty_pages--;
557     }
558     return (next - base) << TARGET_PAGE_BITS;
559 }
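
/* The dirty bitmap above is indexed by global page number:
 *
 *     bit = (rb->offset >> TARGET_PAGE_BITS) + (start >> TARGET_PAGE_BITS)
 *
 * and the return value is converted back to a byte offset relative to the
 * block, so a result >= rb->used_length means "no dirty page left in this
 * block" (that is how find_dirty_block() interprets it).
 */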
560 
561 /* Called with rcu_read_lock() to protect migration_bitmap */
562 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
563 {
564     unsigned long *bitmap;
565     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
566     migration_dirty_pages +=
567         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
568 }
569 
570 /* Fix me: there are too many global variables used in migration process. */
571 static int64_t start_time;
572 static int64_t bytes_xfer_prev;
573 static int64_t num_dirty_pages_period;
574 static uint64_t xbzrle_cache_miss_prev;
575 static uint64_t iterations_prev;
576 
577 static void migration_bitmap_sync_init(void)
578 {
579     start_time = 0;
580     bytes_xfer_prev = 0;
581     num_dirty_pages_period = 0;
582     xbzrle_cache_miss_prev = 0;
583     iterations_prev = 0;
584 }
585 
586 /* Called with iothread lock held, to protect ram_list.dirty_memory[] */
587 static void migration_bitmap_sync(void)
588 {
589     RAMBlock *block;
590     uint64_t num_dirty_pages_init = migration_dirty_pages;
591     MigrationState *s = migrate_get_current();
592     int64_t end_time;
593     int64_t bytes_xfer_now;
594 
595     bitmap_sync_count++;
596 
597     if (!bytes_xfer_prev) {
598         bytes_xfer_prev = ram_bytes_transferred();
599     }
600 
601     if (!start_time) {
602         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
603     }
604 
605     trace_migration_bitmap_sync_start();
606     address_space_sync_dirty_bitmap(&address_space_memory);
607 
608     qemu_mutex_lock(&migration_bitmap_mutex);
609     rcu_read_lock();
610     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
611         migration_bitmap_sync_range(block->offset, block->used_length);
612     }
613     rcu_read_unlock();
614     qemu_mutex_unlock(&migration_bitmap_mutex);
615 
616     trace_migration_bitmap_sync_end(migration_dirty_pages
617                                     - num_dirty_pages_init);
618     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
619     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
620 
621     /* more than 1 second = 1000 milliseconds */
622     if (end_time > start_time + 1000) {
623         if (migrate_auto_converge()) {
624             /* The following detection logic can be refined later. For now:
625                Check to see if the dirtied bytes exceed 50% of the approx.
626                amount of bytes that just got transferred since the last time we
627                were in this routine. If that happens twice, start or increase
628                throttling */
629             bytes_xfer_now = ram_bytes_transferred();
630 
631             if (s->dirty_pages_rate &&
632                (num_dirty_pages_period * TARGET_PAGE_SIZE >
633                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
634                (dirty_rate_high_cnt++ >= 2)) {
635                     trace_migration_throttle();
636                     dirty_rate_high_cnt = 0;
637                     mig_throttle_guest_down();
638              }
639              bytes_xfer_prev = bytes_xfer_now;
640         }
641 
642         if (migrate_use_xbzrle()) {
643             if (iterations_prev != acct_info.iterations) {
644                 acct_info.xbzrle_cache_miss_rate =
645                    (double)(acct_info.xbzrle_cache_miss -
646                             xbzrle_cache_miss_prev) /
647                    (acct_info.iterations - iterations_prev);
648             }
649             iterations_prev = acct_info.iterations;
650             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
651         }
652         s->dirty_pages_rate = num_dirty_pages_period * 1000
653             / (end_time - start_time);
654         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
655         start_time = end_time;
656         num_dirty_pages_period = 0;
657     }
658     s->dirty_sync_count = bitmap_sync_count;
659 }
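
/* For reference, the rates computed above reduce to (restating the code, the
 * times being in milliseconds):
 *
 *     dirty_pages_rate = num_dirty_pages_period * 1000 / (end_time - start_time)
 *     dirty_bytes_rate = dirty_pages_rate * TARGET_PAGE_SIZE
 *
 * and auto-converge throttling kicks in after the bytes dirtied in a period
 * have exceeded half of the bytes transferred in that period several times,
 * as tracked by dirty_rate_high_cnt.
 */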
660 
661 /**
662  * save_zero_page: Send the zero page to the stream
663  *
664  * Returns: Number of pages written.
665  *
666  * @f: QEMUFile where to send the data
667  * @block: block that contains the page we want to send
668  * @offset: offset inside the block for the page
669  * @p: pointer to the page
670  * @bytes_transferred: increase it with the number of transferred bytes
671  */
672 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
673                           uint8_t *p, uint64_t *bytes_transferred)
674 {
675     int pages = -1;
676 
677     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
678         acct_info.dup_pages++;
679         *bytes_transferred += save_page_header(f, block,
680                                                offset | RAM_SAVE_FLAG_COMPRESS);
681         qemu_put_byte(f, 0);
682         *bytes_transferred += 1;
683         pages = 1;
684     }
685 
686     return pages;
687 }
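
/* A zero page costs only the header plus one byte on the wire:
 *
 *     <page header with RAM_SAVE_FLAG_COMPRESS set>
 *     u8 0
 *
 * On the destination, ram_handle_compressed() memsets the page only if it is
 * not already zero, so untouched pages are left alone.
 */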
688 
689 /**
690  * ram_save_page: Send the given page to the stream
691  *
692  * Returns: Number of pages written.
693  *
694  * @f: QEMUFile where to send the data
695  * @block: block that contains the page we want to send
696  * @offset: offset inside the block for the page
697  * @last_stage: if we are at the completion stage
698  * @bytes_transferred: increase it with the number of transferred bytes
699  */
700 static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
701                          bool last_stage, uint64_t *bytes_transferred)
702 {
703     int pages = -1;
704     uint64_t bytes_xmit;
705     ram_addr_t current_addr;
706     uint8_t *p;
707     int ret;
708     bool send_async = true;
709 
710     p = block->host + offset;
711 
712     /* When in doubt, send the page as normal */
713     bytes_xmit = 0;
714     ret = ram_control_save_page(f, block->offset,
715                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
716     if (bytes_xmit) {
717         *bytes_transferred += bytes_xmit;
718         pages = 1;
719     }
720 
721     XBZRLE_cache_lock();
722 
723     current_addr = block->offset + offset;
724 
725     if (block == last_sent_block) {
726         offset |= RAM_SAVE_FLAG_CONTINUE;
727     }
728     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
729         if (ret != RAM_SAVE_CONTROL_DELAYED) {
730             if (bytes_xmit > 0) {
731                 acct_info.norm_pages++;
732             } else if (bytes_xmit == 0) {
733                 acct_info.dup_pages++;
734             }
735         }
736     } else {
737         pages = save_zero_page(f, block, offset, p, bytes_transferred);
738         if (pages > 0) {
739             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
740              * page would be stale
741              */
742             xbzrle_cache_zero_page(current_addr);
743         } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
744             pages = save_xbzrle_page(f, &p, current_addr, block,
745                                      offset, last_stage, bytes_transferred);
746             if (!last_stage) {
747                 /* Can't send this cached data async, since the cache page
748                  * might get updated before it gets to the wire
749                  */
750                 send_async = false;
751             }
752         }
753     }
754 
755     /* XBZRLE overflow or normal page */
756     if (pages == -1) {
757         *bytes_transferred += save_page_header(f, block,
758                                                offset | RAM_SAVE_FLAG_PAGE);
759         if (send_async) {
760             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
761         } else {
762             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
763         }
764         *bytes_transferred += TARGET_PAGE_SIZE;
765         pages = 1;
766         acct_info.norm_pages++;
767     }
768 
769     XBZRLE_cache_unlock();
770 
771     return pages;
772 }
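
/* In short, ram_save_page() tries, in order:
 *   1. ram_control_save_page() (e.g. RDMA), which may send or delay the page;
 *   2. save_zero_page() if the page is entirely zero;
 *   3. save_xbzrle_page() if XBZRLE is enabled and the bulk stage is over;
 *   4. otherwise a plain RAM_SAVE_FLAG_PAGE copy of TARGET_PAGE_SIZE bytes.
 */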
773 
774 static int do_compress_ram_page(CompressParam *param)
775 {
776     int bytes_sent, blen;
777     uint8_t *p;
778     RAMBlock *block = param->block;
779     ram_addr_t offset = param->offset;
780 
781     p = block->host + (offset & TARGET_PAGE_MASK);
782 
783     bytes_sent = save_page_header(param->file, block, offset |
784                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
785     blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
786                                      migrate_compress_level());
787     bytes_sent += blen;
788 
789     return bytes_sent;
790 }
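
/* The compressed-page record written above matches the
 * RAM_SAVE_FLAG_COMPRESS_PAGE case in ram_load():
 *
 *     <page header with RAM_SAVE_FLAG_COMPRESS_PAGE set>
 *     be32  length of the compressed data
 *     bytes zlib-deflated TARGET_PAGE_SIZE page
 *
 * qemu_put_compression_data() produces the be32 length plus the deflated
 * bytes, and do_data_decompress() inflates them with uncompress().
 */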
791 
792 static inline void start_compression(CompressParam *param)
793 {
794     param->done = false;
795     qemu_mutex_lock(&param->mutex);
796     param->start = true;
797     qemu_cond_signal(&param->cond);
798     qemu_mutex_unlock(&param->mutex);
799 }
800 
801 static inline void start_decompression(DecompressParam *param)
802 {
803     qemu_mutex_lock(&param->mutex);
804     param->start = true;
805     qemu_cond_signal(&param->cond);
806     qemu_mutex_unlock(&param->mutex);
807 }
808 
809 static uint64_t bytes_transferred;
810 
811 static void flush_compressed_data(QEMUFile *f)
812 {
813     int idx, len, thread_count;
814 
815     if (!migrate_use_compression()) {
816         return;
817     }
818     thread_count = migrate_compress_threads();
819     for (idx = 0; idx < thread_count; idx++) {
820         if (!comp_param[idx].done) {
821             qemu_mutex_lock(comp_done_lock);
822             while (!comp_param[idx].done && !quit_comp_thread) {
823                 qemu_cond_wait(comp_done_cond, comp_done_lock);
824             }
825             qemu_mutex_unlock(comp_done_lock);
826         }
827         if (!quit_comp_thread) {
828             len = qemu_put_qemu_file(f, comp_param[idx].file);
829             bytes_transferred += len;
830         }
831     }
832 }
833 
834 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
835                                        ram_addr_t offset)
836 {
837     param->block = block;
838     param->offset = offset;
839 }
840 
841 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
842                                            ram_addr_t offset,
843                                            uint64_t *bytes_transferred)
844 {
845     int idx, thread_count, bytes_xmit = -1, pages = -1;
846 
847     thread_count = migrate_compress_threads();
848     qemu_mutex_lock(comp_done_lock);
849     while (true) {
850         for (idx = 0; idx < thread_count; idx++) {
851             if (comp_param[idx].done) {
852                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
853                 set_compress_params(&comp_param[idx], block, offset);
854                 start_compression(&comp_param[idx]);
855                 pages = 1;
856                 acct_info.norm_pages++;
857                 *bytes_transferred += bytes_xmit;
858                 break;
859             }
860         }
861         if (pages > 0) {
862             break;
863         } else {
864             qemu_cond_wait(comp_done_cond, comp_done_lock);
865         }
866     }
867     qemu_mutex_unlock(comp_done_lock);
868 
869     return pages;
870 }
871 
872 /**
873  * ram_save_compressed_page: compress the given page and send it to the stream
874  *
875  * Returns: Number of pages written.
876  *
877  * @f: QEMUFile where to send the data
878  * @block: block that contains the page we want to send
879  * @offset: offset inside the block for the page
880  * @last_stage: if we are at the completion stage
881  * @bytes_transferred: increase it with the number of transferred bytes
882  */
883 static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
884                                     ram_addr_t offset, bool last_stage,
885                                     uint64_t *bytes_transferred)
886 {
887     int pages = -1;
888     uint64_t bytes_xmit;
889     uint8_t *p;
890     int ret;
891 
892     p = block->host + offset;
893 
894     bytes_xmit = 0;
895     ret = ram_control_save_page(f, block->offset,
896                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
897     if (bytes_xmit) {
898         *bytes_transferred += bytes_xmit;
899         pages = 1;
900     }
901     if (block == last_sent_block) {
902         offset |= RAM_SAVE_FLAG_CONTINUE;
903     }
904     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
905         if (ret != RAM_SAVE_CONTROL_DELAYED) {
906             if (bytes_xmit > 0) {
907                 acct_info.norm_pages++;
908             } else if (bytes_xmit == 0) {
909                 acct_info.dup_pages++;
910             }
911         }
912     } else {
913         /* When starting the process of a new block, the first page of
914          * the block should be sent out before other pages in the same
915          * block, and all the pages in the last block should have been sent
916          * out. Keeping this order is important, because the 'cont' flag
917          * is used to avoid resending the block name.
918          */
919         if (block != last_sent_block) {
920             flush_compressed_data(f);
921             pages = save_zero_page(f, block, offset, p, bytes_transferred);
922             if (pages == -1) {
923                 set_compress_params(&comp_param[0], block, offset);
924                 /* Use the qemu thread to compress the data to make sure the
925                  * first page is sent out before other pages
926                  */
927                 bytes_xmit = do_compress_ram_page(&comp_param[0]);
928                 acct_info.norm_pages++;
929                 qemu_put_qemu_file(f, comp_param[0].file);
930                 *bytes_transferred += bytes_xmit;
931                 pages = 1;
932             }
933         } else {
934             pages = save_zero_page(f, block, offset, p, bytes_transferred);
935             if (pages == -1) {
936                 pages = compress_page_with_multi_thread(f, block, offset,
937                                                         bytes_transferred);
938             }
939         }
940     }
941 
942     return pages;
943 }
944 
945 /*
946  * Find the next dirty page and update any state associated with
947  * the search process.
948  *
949  * Returns: True if a page is found
950  *
951  * @f: Current migration stream.
952  * @pss: Data about the state of the current dirty page scan.
953  * @*again: Set to false if the search has scanned the whole of RAM
954  */
955 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
956                              bool *again)
957 {
958     pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
959                                                        pss->offset);
960     if (pss->complete_round && pss->block == last_seen_block &&
961         pss->offset >= last_offset) {
962         /*
963          * We've been once around the RAM and haven't found anything.
964          * Give up.
965          */
966         *again = false;
967         return false;
968     }
969     if (pss->offset >= pss->block->used_length) {
970         /* Didn't find anything in this RAM Block */
971         pss->offset = 0;
972         pss->block = QLIST_NEXT_RCU(pss->block, next);
973         if (!pss->block) {
974             /* Hit the end of the list */
975             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
976             /* Flag that we've looped */
977             pss->complete_round = true;
978             ram_bulk_stage = false;
979             if (migrate_use_xbzrle()) {
980                 /* If xbzrle is on, stop using the data compression at this
981                  * point. In theory, xbzrle can do better than compression.
982                  */
983                 flush_compressed_data(f);
984                 compression_switch = false;
985             }
986         }
987         /* Didn't find anything this time, but try again on the new block */
988         *again = true;
989         return false;
990     } else {
991         /* Can go around again, but... */
992         *again = true;
993         /* We've found something so probably don't need to */
994         return true;
995     }
996 }
997 
998 /**
999  * ram_find_and_save_block: Finds a dirty page and sends it to f
1000  *
1001  * Called within an RCU critical section.
1002  *
1003  * Returns:  The number of pages written
1004  *           0 means no dirty pages
1005  *
1006  * @f: QEMUFile where to send the data
1007  * @last_stage: if we are at the completion stage
1008  * @bytes_transferred: increase it with the number of transferred bytes
1009  */
1010 
1011 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1012                                    uint64_t *bytes_transferred)
1013 {
1014     PageSearchStatus pss;
1015     int pages = 0;
1016     bool again, found;
1017 
1018     pss.block = last_seen_block;
1019     pss.offset = last_offset;
1020     pss.complete_round = false;
1021 
1022     if (!pss.block) {
1023         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1024     }
1025 
1026     do {
1027         found = find_dirty_block(f, &pss, &again);
1028 
1029         if (found) {
1030             if (compression_switch && migrate_use_compression()) {
1031                 pages = ram_save_compressed_page(f, pss.block, pss.offset,
1032                                                  last_stage,
1033                                                  bytes_transferred);
1034             } else {
1035                 pages = ram_save_page(f, pss.block, pss.offset, last_stage,
1036                                       bytes_transferred);
1037             }
1038 
1039             /* if page is unmodified, continue to the next */
1040             if (pages > 0) {
1041                 last_sent_block = pss.block;
1042             }
1043         }
1044     } while (!pages && again);
1045 
1046     last_seen_block = pss.block;
1047     last_offset = pss.offset;
1048 
1049     return pages;
1050 }
1051 
1052 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1053 {
1054     uint64_t pages = size / TARGET_PAGE_SIZE;
1055     if (zero) {
1056         acct_info.dup_pages += pages;
1057     } else {
1058         acct_info.norm_pages += pages;
1059         bytes_transferred += size;
1060         qemu_update_position(f, size);
1061     }
1062 }
1063 
1064 static ram_addr_t ram_save_remaining(void)
1065 {
1066     return migration_dirty_pages;
1067 }
1068 
1069 uint64_t ram_bytes_remaining(void)
1070 {
1071     return ram_save_remaining() * TARGET_PAGE_SIZE;
1072 }
1073 
1074 uint64_t ram_bytes_transferred(void)
1075 {
1076     return bytes_transferred;
1077 }
1078 
1079 uint64_t ram_bytes_total(void)
1080 {
1081     RAMBlock *block;
1082     uint64_t total = 0;
1083 
1084     rcu_read_lock();
1085     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1086         total += block->used_length;
1087     rcu_read_unlock();
1088     return total;
1089 }
1090 
1091 void free_xbzrle_decoded_buf(void)
1092 {
1093     g_free(xbzrle_decoded_buf);
1094     xbzrle_decoded_buf = NULL;
1095 }
1096 
1097 static void migration_bitmap_free(struct BitmapRcu *bmap)
1098 {
1099     g_free(bmap->bmap);
1100     g_free(bmap);
1101 }
1102 
1103 static void migration_end(void)
1104 {
1105     /* the caller holds the iothread lock or is in a bh, so there is
1106      * no write race against this migration_bitmap
1107      */
1108     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1109     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1110     if (bitmap) {
1111         memory_global_dirty_log_stop();
1112         call_rcu(bitmap, migration_bitmap_free, rcu);
1113     }
1114 
1115     XBZRLE_cache_lock();
1116     if (XBZRLE.cache) {
1117         cache_fini(XBZRLE.cache);
1118         g_free(XBZRLE.encoded_buf);
1119         g_free(XBZRLE.current_buf);
1120         XBZRLE.cache = NULL;
1121         XBZRLE.encoded_buf = NULL;
1122         XBZRLE.current_buf = NULL;
1123     }
1124     XBZRLE_cache_unlock();
1125 }
1126 
1127 static void ram_migration_cancel(void *opaque)
1128 {
1129     migration_end();
1130 }
1131 
1132 static void reset_ram_globals(void)
1133 {
1134     last_seen_block = NULL;
1135     last_sent_block = NULL;
1136     last_offset = 0;
1137     last_version = ram_list.version;
1138     ram_bulk_stage = true;
1139 }
1140 
1141 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1142 
1143 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1144 {
1145     /* called in the qemu main thread, so there is
1146      * no write race against this migration_bitmap
1147      */
1148     if (migration_bitmap_rcu) {
1149         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1150         bitmap = g_new(struct BitmapRcu, 1);
1151         bitmap->bmap = bitmap_new(new);
1152 
1153         /* prevent bits in migration_bitmap from being set
1154          * by migration_bitmap_sync_range() at the same time.
1155          * It is safe for migration if a bit in migration_bitmap is
1156          * cleared at the same time.
1157          */
1158         qemu_mutex_lock(&migration_bitmap_mutex);
1159         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1160         bitmap_set(bitmap->bmap, old, new - old);
1161         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1162         qemu_mutex_unlock(&migration_bitmap_mutex);
1163         migration_dirty_pages += new - old;
1164         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1165     }
1166 }
1167 
1168 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1169  * a long-running RCU critical section.  When RCU reclaims in the code
1170  * start to become numerous it will be necessary to reduce the
1171  * granularity of these critical sections.
1172  */
1173 
1174 static int ram_save_setup(QEMUFile *f, void *opaque)
1175 {
1176     RAMBlock *block;
1177     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1178 
1179     dirty_rate_high_cnt = 0;
1180     bitmap_sync_count = 0;
1181     migration_bitmap_sync_init();
1182     qemu_mutex_init(&migration_bitmap_mutex);
1183 
1184     if (migrate_use_xbzrle()) {
1185         XBZRLE_cache_lock();
1186         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1187                                   TARGET_PAGE_SIZE,
1188                                   TARGET_PAGE_SIZE);
1189         if (!XBZRLE.cache) {
1190             XBZRLE_cache_unlock();
1191             error_report("Error creating cache");
1192             return -1;
1193         }
1194         XBZRLE_cache_unlock();
1195 
1196         /* We prefer not to abort if there is no memory */
1197         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1198         if (!XBZRLE.encoded_buf) {
1199             error_report("Error allocating encoded_buf");
1200             return -1;
1201         }
1202 
1203         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1204         if (!XBZRLE.current_buf) {
1205             error_report("Error allocating current_buf");
1206             g_free(XBZRLE.encoded_buf);
1207             XBZRLE.encoded_buf = NULL;
1208             return -1;
1209         }
1210 
1211         acct_clear();
1212     }
1213 
1214     /* iothread lock needed for ram_list.dirty_memory[] */
1215     qemu_mutex_lock_iothread();
1216     qemu_mutex_lock_ramlist();
1217     rcu_read_lock();
1218     bytes_transferred = 0;
1219     reset_ram_globals();
1220 
1221     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1222     migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
1223     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1224     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1225 
1226     /*
1227      * Count the total number of pages used by ram blocks not including any
1228      * gaps due to alignment or unplugs.
1229      */
1230     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1231 
1232     memory_global_dirty_log_start();
1233     migration_bitmap_sync();
1234     qemu_mutex_unlock_ramlist();
1235     qemu_mutex_unlock_iothread();
1236 
1237     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1238 
1239     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1240         qemu_put_byte(f, strlen(block->idstr));
1241         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1242         qemu_put_be64(f, block->used_length);
1243     }
1244 
1245     rcu_read_unlock();
1246 
1247     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1248     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1249 
1250     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1251 
1252     return 0;
1253 }
1254 
1255 static int ram_save_iterate(QEMUFile *f, void *opaque)
1256 {
1257     int ret;
1258     int i;
1259     int64_t t0;
1260     int pages_sent = 0;
1261 
1262     rcu_read_lock();
1263     if (ram_list.version != last_version) {
1264         reset_ram_globals();
1265     }
1266 
1267     /* Read version before ram_list.blocks */
1268     smp_rmb();
1269 
1270     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1271 
1272     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1273     i = 0;
1274     while ((ret = qemu_file_rate_limit(f)) == 0) {
1275         int pages;
1276 
1277         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1278         /* no more pages to send */
1279         if (pages == 0) {
1280             break;
1281         }
1282         pages_sent += pages;
1283         acct_info.iterations++;
1284 
1285         /* we want to check in the 1st loop, just in case it was the 1st time
1286            and we had to sync the dirty bitmap.
1287            qemu_clock_get_ns() is a bit expensive, so we only check once
1288            every few iterations
1289         */
1290         if ((i & 63) == 0) {
1291             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1292             if (t1 > MAX_WAIT) {
1293                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
1294                         t1, i);
1295                 break;
1296             }
1297         }
1298         i++;
1299     }
1300     flush_compressed_data(f);
1301     rcu_read_unlock();
1302 
1303     /*
1304      * Must occur before EOS (or any QEMUFile operation)
1305      * because of RDMA protocol.
1306      */
1307     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1308 
1309     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1310     bytes_transferred += 8;
1311 
1312     ret = qemu_file_get_error(f);
1313     if (ret < 0) {
1314         return ret;
1315     }
1316 
1317     return pages_sent;
1318 }
1319 
1320 /* Called with iothread lock */
1321 static int ram_save_complete(QEMUFile *f, void *opaque)
1322 {
1323     rcu_read_lock();
1324 
1325     migration_bitmap_sync();
1326 
1327     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
1328 
1329     /* try transferring iterative blocks of memory */
1330 
1331     /* flush all remaining blocks regardless of rate limiting */
1332     while (true) {
1333         int pages;
1334 
1335         pages = ram_find_and_save_block(f, true, &bytes_transferred);
1336         /* no more blocks to send */
1337         if (pages == 0) {
1338             break;
1339         }
1340     }
1341 
1342     flush_compressed_data(f);
1343     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
1344 
1345     rcu_read_unlock();
1346 
1347     migration_end();
1348     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1349 
1350     return 0;
1351 }
1352 
1353 static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
1354 {
1355     uint64_t remaining_size;
1356 
1357     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1358 
1359     if (remaining_size < max_size) {
1360         qemu_mutex_lock_iothread();
1361         rcu_read_lock();
1362         migration_bitmap_sync();
1363         rcu_read_unlock();
1364         qemu_mutex_unlock_iothread();
1365         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1366     }
1367     return remaining_size;
1368 }
1369 
1370 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
1371 {
1372     unsigned int xh_len;
1373     int xh_flags;
1374 
1375     if (!xbzrle_decoded_buf) {
1376         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1377     }
1378 
1379     /* extract RLE header */
1380     xh_flags = qemu_get_byte(f);
1381     xh_len = qemu_get_be16(f);
1382 
1383     if (xh_flags != ENCODING_FLAG_XBZRLE) {
1384         error_report("Failed to load XBZRLE page - wrong compression!");
1385         return -1;
1386     }
1387 
1388     if (xh_len > TARGET_PAGE_SIZE) {
1389         error_report("Failed to load XBZRLE page - len overflow!");
1390         return -1;
1391     }
1392     /* load data and decode */
1393     qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
1394 
1395     /* decode RLE */
1396     if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
1397                              TARGET_PAGE_SIZE) == -1) {
1398         error_report("Failed to load XBZRLE page - decode error!");
1399         return -1;
1400     }
1401 
1402     return 0;
1403 }
1404 
1405 /* Must be called from within a rcu critical section.
1406  * Returns a pointer from within the RCU-protected ram_list.
1407  */
1408 static inline void *host_from_stream_offset(QEMUFile *f,
1409                                             ram_addr_t offset,
1410                                             int flags)
1411 {
1412     static RAMBlock *block = NULL;
1413     char id[256];
1414     uint8_t len;
1415 
1416     if (flags & RAM_SAVE_FLAG_CONTINUE) {
1417         if (!block || block->max_length <= offset) {
1418             error_report("Ack, bad migration stream!");
1419             return NULL;
1420         }
1421 
1422         return block->host + offset;
1423     }
1424 
1425     len = qemu_get_byte(f);
1426     qemu_get_buffer(f, (uint8_t *)id, len);
1427     id[len] = 0;
1428 
1429     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1430         if (!strncmp(id, block->idstr, sizeof(id)) &&
1431             block->max_length > offset) {
1432             return block->host + offset;
1433         }
1434     }
1435 
1436     error_report("Can't find block %s!", id);
1437     return NULL;
1438 }
1439 
1440 /*
1441  * If a page (or a whole RDMA chunk) has been
1442  * determined to be zero, then zap it.
1443  */
1444 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
1445 {
1446     if (ch != 0 || !is_zero_range(host, size)) {
1447         memset(host, ch, size);
1448     }
1449 }
1450 
1451 static void *do_data_decompress(void *opaque)
1452 {
1453     DecompressParam *param = opaque;
1454     unsigned long pagesize;
1455 
1456     while (!quit_decomp_thread) {
1457         qemu_mutex_lock(&param->mutex);
1458         while (!param->start && !quit_decomp_thread) {
1459             qemu_cond_wait(&param->cond, &param->mutex);
1460             pagesize = TARGET_PAGE_SIZE;
1461             if (!quit_decomp_thread) {
1462                 /* uncompress() may fail in some cases, especially
1463                  * when the page was dirtied during the compression; it's
1464                  * not a problem because the dirty page will be retransferred
1465                  * and uncompress() won't break the data in other pages.
1466                  */
1467                 uncompress((Bytef *)param->des, &pagesize,
1468                            (const Bytef *)param->compbuf, param->len);
1469             }
1470             param->start = false;
1471         }
1472         qemu_mutex_unlock(&param->mutex);
1473     }
1474 
1475     return NULL;
1476 }
1477 
1478 void migrate_decompress_threads_create(void)
1479 {
1480     int i, thread_count;
1481 
1482     thread_count = migrate_decompress_threads();
1483     decompress_threads = g_new0(QemuThread, thread_count);
1484     decomp_param = g_new0(DecompressParam, thread_count);
1485     compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1486     quit_decomp_thread = false;
1487     for (i = 0; i < thread_count; i++) {
1488         qemu_mutex_init(&decomp_param[i].mutex);
1489         qemu_cond_init(&decomp_param[i].cond);
1490         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1491         qemu_thread_create(decompress_threads + i, "decompress",
1492                            do_data_decompress, decomp_param + i,
1493                            QEMU_THREAD_JOINABLE);
1494     }
1495 }
1496 
1497 void migrate_decompress_threads_join(void)
1498 {
1499     int i, thread_count;
1500 
1501     quit_decomp_thread = true;
1502     thread_count = migrate_decompress_threads();
1503     for (i = 0; i < thread_count; i++) {
1504         qemu_mutex_lock(&decomp_param[i].mutex);
1505         qemu_cond_signal(&decomp_param[i].cond);
1506         qemu_mutex_unlock(&decomp_param[i].mutex);
1507     }
1508     for (i = 0; i < thread_count; i++) {
1509         qemu_thread_join(decompress_threads + i);
1510         qemu_mutex_destroy(&decomp_param[i].mutex);
1511         qemu_cond_destroy(&decomp_param[i].cond);
1512         g_free(decomp_param[i].compbuf);
1513     }
1514     g_free(decompress_threads);
1515     g_free(decomp_param);
1516     g_free(compressed_data_buf);
1517     decompress_threads = NULL;
1518     decomp_param = NULL;
1519     compressed_data_buf = NULL;
1520 }
1521 
1522 static void decompress_data_with_multi_threads(uint8_t *compbuf,
1523                                                void *host, int len)
1524 {
1525     int idx, thread_count;
1526 
1527     thread_count = migrate_decompress_threads();
1528     while (true) {
1529         for (idx = 0; idx < thread_count; idx++) {
1530             if (!decomp_param[idx].start) {
1531                 memcpy(decomp_param[idx].compbuf, compbuf, len);
1532                 decomp_param[idx].des = host;
1533                 decomp_param[idx].len = len;
1534                 start_decompression(&decomp_param[idx]);
1535                 break;
1536             }
1537         }
1538         if (idx < thread_count) {
1539             break;
1540         }
1541     }
1542 }
1543 
1544 static int ram_load(QEMUFile *f, void *opaque, int version_id)
1545 {
1546     int flags = 0, ret = 0;
1547     static uint64_t seq_iter;
1548     int len = 0;
1549 
1550     seq_iter++;
1551 
1552     if (version_id != 4) {
1553         ret = -EINVAL;
1554     }
1555 
1556     /* This RCU critical section can be very long running.
1557      * When RCU reclaims in the code start to become numerous,
1558      * it will be necessary to reduce the granularity of this
1559      * critical section.
1560      */
1561     rcu_read_lock();
1562     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
1563         ram_addr_t addr, total_ram_bytes;
1564         void *host;
1565         uint8_t ch;
1566 
1567         addr = qemu_get_be64(f);
1568         flags = addr & ~TARGET_PAGE_MASK;
1569         addr &= TARGET_PAGE_MASK;
1570 
1571         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
1572         case RAM_SAVE_FLAG_MEM_SIZE:
1573             /* Synchronize RAM block list */
1574             total_ram_bytes = addr;
1575             while (!ret && total_ram_bytes) {
1576                 RAMBlock *block;
1577                 char id[256];
1578                 ram_addr_t length;
1579 
1580                 len = qemu_get_byte(f);
1581                 qemu_get_buffer(f, (uint8_t *)id, len);
1582                 id[len] = 0;
1583                 length = qemu_get_be64(f);
1584 
1585                 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1586                     if (!strncmp(id, block->idstr, sizeof(id))) {
1587                         if (length != block->used_length) {
1588                             Error *local_err = NULL;
1589 
1590                             ret = qemu_ram_resize(block->offset, length, &local_err);
1591                             if (local_err) {
1592                                 error_report_err(local_err);
1593                             }
1594                         }
1595                         ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
1596                                               block->idstr);
1597                         break;
1598                     }
1599                 }
1600 
1601                 if (!block) {
1602                     error_report("Unknown ramblock \"%s\", cannot "
1603                                  "accept migration", id);
1604                     ret = -EINVAL;
1605                 }
1606 
1607                 total_ram_bytes -= length;
1608             }
1609             break;
1610         case RAM_SAVE_FLAG_COMPRESS:
1611             host = host_from_stream_offset(f, addr, flags);
1612             if (!host) {
1613                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1614                 ret = -EINVAL;
1615                 break;
1616             }
1617             ch = qemu_get_byte(f);
1618             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
1619             break;
1620         case RAM_SAVE_FLAG_PAGE:
1621             host = host_from_stream_offset(f, addr, flags);
1622             if (!host) {
1623                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1624                 ret = -EINVAL;
1625                 break;
1626             }
1627             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
1628             break;
1629         case RAM_SAVE_FLAG_COMPRESS_PAGE:
1630             host = host_from_stream_offset(f, addr, flags);
1631             if (!host) {
1632                 error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
1633                 ret = -EINVAL;
1634                 break;
1635             }
1636 
1637             len = qemu_get_be32(f);
1638             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
1639                 error_report("Invalid compressed data length: %d", len);
1640                 ret = -EINVAL;
1641                 break;
1642             }
1643             qemu_get_buffer(f, compressed_data_buf, len);
1644             decompress_data_with_multi_threads(compressed_data_buf, host, len);
1645             break;
1646         case RAM_SAVE_FLAG_XBZRLE:
1647             host = host_from_stream_offset(f, addr, flags);
1648             if (!host) {
1649                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1650                 ret = -EINVAL;
1651                 break;
1652             }
1653             if (load_xbzrle(f, addr, host) < 0) {
1654                 error_report("Failed to decompress XBZRLE page at "
1655                              RAM_ADDR_FMT, addr);
1656                 ret = -EINVAL;
1657                 break;
1658             }
1659             break;
1660         case RAM_SAVE_FLAG_EOS:
1661             /* normal exit */
1662             break;
1663         default:
1664             if (flags & RAM_SAVE_FLAG_HOOK) {
1665                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
1666             } else {
1667                 error_report("Unknown combination of migration flags: %#x",
1668                              flags);
1669                 ret = -EINVAL;
1670             }
1671         }
1672         if (!ret) {
1673             ret = qemu_file_get_error(f);
1674         }
1675     }
1676 
1677     rcu_read_unlock();
1678     DPRINTF("Completed load of VM with exit code %d seq iteration "
1679             "%" PRIu64 "\n", ret, seq_iter);
1680     return ret;
1681 }
1682 
1683 static SaveVMHandlers savevm_ram_handlers = {
1684     .save_live_setup = ram_save_setup,
1685     .save_live_iterate = ram_save_iterate,
1686     .save_live_complete = ram_save_complete,
1687     .save_live_pending = ram_save_pending,
1688     .load_state = ram_load,
1689     .cancel = ram_migration_cancel,
1690 };
1691 
1692 void ram_mig_init(void)
1693 {
1694     qemu_mutex_init(&XBZRLE.lock);
1695     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
1696 }
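
/* ram_mig_init() above registers "ram" as a live migration section with
 * instance id 0 and version 4; that version must stay in sync with the
 * "version_id != 4" check in ram_load().
 */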
1697