xref: /openbmc/qemu/migration/ram.c (revision c1fcf220c95b17f1a05adb799824d2c352ff9281)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include <stdint.h>
29 #include <zlib.h>
30 #include "qemu/bitops.h"
31 #include "qemu/bitmap.h"
32 #include "qemu/timer.h"
33 #include "qemu/main-loop.h"
34 #include "migration/migration.h"
35 #include "exec/address-spaces.h"
36 #include "migration/page_cache.h"
37 #include "qemu/error-report.h"
38 #include "trace.h"
39 #include "exec/ram_addr.h"
40 #include "qemu/rcu_queue.h"
41 
42 #ifdef DEBUG_MIGRATION_RAM
43 #define DPRINTF(fmt, ...) \
44     do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
45 #else
46 #define DPRINTF(fmt, ...) \
47     do { } while (0)
48 #endif
49 
50 static int dirty_rate_high_cnt;
51 
52 static uint64_t bitmap_sync_count;
53 
54 /***********************************************************/
55 /* ram save/restore */
56 
57 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
58 #define RAM_SAVE_FLAG_COMPRESS 0x02
59 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
60 #define RAM_SAVE_FLAG_PAGE     0x08
61 #define RAM_SAVE_FLAG_EOS      0x10
62 #define RAM_SAVE_FLAG_CONTINUE 0x20
63 #define RAM_SAVE_FLAG_XBZRLE   0x40
64 /* 0x80 is reserved in migration.h start with 0x100 next */
65 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
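
/*
 * Rough sketch of the resulting wire format of a single RAM record, as
 * written by save_page_header() and friends below and parsed by ram_load()
 * (integers are big-endian):
 *
 *   be64  offset|flags            page offset in its block, flags in low bits
 *   [u8 len, idstr[len]]          block name, only without FLAG_CONTINUE
 *   payload, depending on the flag:
 *     RAM_SAVE_FLAG_COMPRESS      : one fill byte (zero pages)
 *     RAM_SAVE_FLAG_PAGE          : TARGET_PAGE_SIZE raw bytes
 *     RAM_SAVE_FLAG_XBZRLE        : u8 ENCODING_FLAG_XBZRLE, be16 len, data
 *     RAM_SAVE_FLAG_COMPRESS_PAGE : be32 len, zlib-compressed data
 *     RAM_SAVE_FLAG_MEM_SIZE      : offset is the total RAM size; then for
 *                                   each block: u8 len, idstr, be64 used_length
 *     RAM_SAVE_FLAG_EOS           : end of section, no payload
 */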
66 
67 static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
68 
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
70 {
71     return buffer_find_nonzero_offset(p, size) == size;
72 }
73 
74 /* This struct contains the XBZRLE cache and a static page
75    used by the compression */
76 static struct {
77     /* buffer used for XBZRLE encoding */
78     uint8_t *encoded_buf;
79     /* buffer for storing page content */
80     uint8_t *current_buf;
81     /* Cache for XBZRLE, Protected by lock. */
82     PageCache *cache;
83     QemuMutex lock;
84 } XBZRLE;
85 
86 /* buffer used for XBZRLE decoding */
87 static uint8_t *xbzrle_decoded_buf;
88 
89 static void XBZRLE_cache_lock(void)
90 {
91     if (migrate_use_xbzrle())
92         qemu_mutex_lock(&XBZRLE.lock);
93 }
94 
95 static void XBZRLE_cache_unlock(void)
96 {
97     if (migrate_use_xbzrle())
98         qemu_mutex_unlock(&XBZRLE.lock);
99 }
100 
101 /*
102  * Called from qmp_migrate_set_cache_size in the main thread, possibly while
103  * a migration is in progress.
104  * A running migration may be using the cache and might finish during this
105  * call, hence changes to the cache are protected by XBZRLE.lock.
106  */
107 int64_t xbzrle_cache_resize(int64_t new_size)
108 {
109     PageCache *new_cache;
110     int64_t ret;
111 
112     if (new_size < TARGET_PAGE_SIZE) {
113         return -1;
114     }
115 
116     XBZRLE_cache_lock();
117 
118     if (XBZRLE.cache != NULL) {
119         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120             goto out_new_size;
121         }
122         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123                                         TARGET_PAGE_SIZE);
124         if (!new_cache) {
125             error_report("Error creating cache");
126             ret = -1;
127             goto out;
128         }
129 
130         cache_fini(XBZRLE.cache);
131         XBZRLE.cache = new_cache;
132     }
133 
134 out_new_size:
135     ret = pow2floor(new_size);
136 out:
137     XBZRLE_cache_unlock();
138     return ret;
139 }
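
/*
 * Worked example: asking for a 70 MiB cache returns pow2floor(70 MiB) =
 * 64 MiB to the caller; the cache itself is created with
 * new_size / TARGET_PAGE_SIZE entries (cache_init() is assumed to apply
 * the same power-of-two rounding).  Requests smaller than one target page
 * fail with -1.
 */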
140 
141 /* accounting for migration statistics */
142 typedef struct AccountingInfo {
143     uint64_t dup_pages;
144     uint64_t skipped_pages;
145     uint64_t norm_pages;
146     uint64_t iterations;
147     uint64_t xbzrle_bytes;
148     uint64_t xbzrle_pages;
149     uint64_t xbzrle_cache_miss;
150     double xbzrle_cache_miss_rate;
151     uint64_t xbzrle_overflows;
152 } AccountingInfo;
153 
154 static AccountingInfo acct_info;
155 
156 static void acct_clear(void)
157 {
158     memset(&acct_info, 0, sizeof(acct_info));
159 }
160 
161 uint64_t dup_mig_bytes_transferred(void)
162 {
163     return acct_info.dup_pages * TARGET_PAGE_SIZE;
164 }
165 
166 uint64_t dup_mig_pages_transferred(void)
167 {
168     return acct_info.dup_pages;
169 }
170 
171 uint64_t skipped_mig_bytes_transferred(void)
172 {
173     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
174 }
175 
176 uint64_t skipped_mig_pages_transferred(void)
177 {
178     return acct_info.skipped_pages;
179 }
180 
181 uint64_t norm_mig_bytes_transferred(void)
182 {
183     return acct_info.norm_pages * TARGET_PAGE_SIZE;
184 }
185 
186 uint64_t norm_mig_pages_transferred(void)
187 {
188     return acct_info.norm_pages;
189 }
190 
191 uint64_t xbzrle_mig_bytes_transferred(void)
192 {
193     return acct_info.xbzrle_bytes;
194 }
195 
196 uint64_t xbzrle_mig_pages_transferred(void)
197 {
198     return acct_info.xbzrle_pages;
199 }
200 
201 uint64_t xbzrle_mig_pages_cache_miss(void)
202 {
203     return acct_info.xbzrle_cache_miss;
204 }
205 
206 double xbzrle_mig_cache_miss_rate(void)
207 {
208     return acct_info.xbzrle_cache_miss_rate;
209 }
210 
211 uint64_t xbzrle_mig_pages_overflow(void)
212 {
213     return acct_info.xbzrle_overflows;
214 }
215 
216 /* This is the last block that we have visited searching for dirty pages
217  */
218 static RAMBlock *last_seen_block;
219 /* This is the last block from where we have sent data */
220 static RAMBlock *last_sent_block;
221 static ram_addr_t last_offset;
222 static QemuMutex migration_bitmap_mutex;
223 static uint64_t migration_dirty_pages;
224 static uint32_t last_version;
225 static bool ram_bulk_stage;
226 
227 /* used by the search for pages to send */
228 struct PageSearchStatus {
229     /* Current block being searched */
230     RAMBlock    *block;
231     /* Current offset to search from */
232     ram_addr_t   offset;
233     /* Set once we wrap around */
234     bool         complete_round;
235 };
236 typedef struct PageSearchStatus PageSearchStatus;
237 
238 static struct BitmapRcu {
239     struct rcu_head rcu;
240     unsigned long *bmap;
241 } *migration_bitmap_rcu;
242 
243 struct CompressParam {
244     bool start;
245     bool done;
246     QEMUFile *file;
247     QemuMutex mutex;
248     QemuCond cond;
249     RAMBlock *block;
250     ram_addr_t offset;
251 };
252 typedef struct CompressParam CompressParam;
253 
254 struct DecompressParam {
255     bool start;
256     QemuMutex mutex;
257     QemuCond cond;
258     void *des;
259     uint8_t *compbuf;
260     int len;
261 };
262 typedef struct DecompressParam DecompressParam;
263 
264 static CompressParam *comp_param;
265 static QemuThread *compress_threads;
266 /* comp_done_cond is used to wake up the migration thread when
267  * one of the compression threads has finished the compression.
268  * comp_done_lock is used together with comp_done_cond.
269  */
270 static QemuMutex *comp_done_lock;
271 static QemuCond *comp_done_cond;
272 /* The empty QEMUFileOps will be used by file in CompressParam */
273 static const QEMUFileOps empty_ops = { };
274 
275 static bool compression_switch;
276 static bool quit_comp_thread;
277 static bool quit_decomp_thread;
278 static DecompressParam *decomp_param;
279 static QemuThread *decompress_threads;
280 static uint8_t *compressed_data_buf;
281 
282 static int do_compress_ram_page(CompressParam *param);
283 
284 static void *do_data_compress(void *opaque)
285 {
286     CompressParam *param = opaque;
287 
288     while (!quit_comp_thread) {
289         qemu_mutex_lock(&param->mutex);
290         /* Re-check quit_comp_thread in case
291          * terminate_compression_threads() was called just before
292          * qemu_mutex_lock(&param->mutex) and after
293          * while (!quit_comp_thread); re-checking it here makes
294          * sure the compression thread terminates as expected.
295          */
296         while (!param->start && !quit_comp_thread) {
297             qemu_cond_wait(&param->cond, &param->mutex);
298         }
299         if (!quit_comp_thread) {
300             do_compress_ram_page(param);
301         }
302         param->start = false;
303         qemu_mutex_unlock(&param->mutex);
304 
305         qemu_mutex_lock(comp_done_lock);
306         param->done = true;
307         qemu_cond_signal(comp_done_cond);
308         qemu_mutex_unlock(comp_done_lock);
309     }
310 
311     return NULL;
312 }
313 
314 static inline void terminate_compression_threads(void)
315 {
316     int idx, thread_count;
317 
318     thread_count = migrate_compress_threads();
319     quit_comp_thread = true;
320     for (idx = 0; idx < thread_count; idx++) {
321         qemu_mutex_lock(&comp_param[idx].mutex);
322         qemu_cond_signal(&comp_param[idx].cond);
323         qemu_mutex_unlock(&comp_param[idx].mutex);
324     }
325 }
326 
327 void migrate_compress_threads_join(void)
328 {
329     int i, thread_count;
330 
331     if (!migrate_use_compression()) {
332         return;
333     }
334     terminate_compression_threads();
335     thread_count = migrate_compress_threads();
336     for (i = 0; i < thread_count; i++) {
337         qemu_thread_join(compress_threads + i);
338         qemu_fclose(comp_param[i].file);
339         qemu_mutex_destroy(&comp_param[i].mutex);
340         qemu_cond_destroy(&comp_param[i].cond);
341     }
342     qemu_mutex_destroy(comp_done_lock);
343     qemu_cond_destroy(comp_done_cond);
344     g_free(compress_threads);
345     g_free(comp_param);
346     g_free(comp_done_cond);
347     g_free(comp_done_lock);
348     compress_threads = NULL;
349     comp_param = NULL;
350     comp_done_cond = NULL;
351     comp_done_lock = NULL;
352 }
353 
354 void migrate_compress_threads_create(void)
355 {
356     int i, thread_count;
357 
358     if (!migrate_use_compression()) {
359         return;
360     }
361     quit_comp_thread = false;
362     compression_switch = true;
363     thread_count = migrate_compress_threads();
364     compress_threads = g_new0(QemuThread, thread_count);
365     comp_param = g_new0(CompressParam, thread_count);
366     comp_done_cond = g_new0(QemuCond, 1);
367     comp_done_lock = g_new0(QemuMutex, 1);
368     qemu_cond_init(comp_done_cond);
369     qemu_mutex_init(comp_done_lock);
370     for (i = 0; i < thread_count; i++) {
371         /* comp_param[i].file is just used as a dummy buffer to save data, set
372          * its ops to empty.
373          */
374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375         comp_param[i].done = true;
376         qemu_mutex_init(&comp_param[i].mutex);
377         qemu_cond_init(&comp_param[i].cond);
378         qemu_thread_create(compress_threads + i, "compress",
379                            do_data_compress, comp_param + i,
380                            QEMU_THREAD_JOINABLE);
381     }
382 }
383 
384 /**
385  * save_page_header: Write page header to wire
386  *
387  * If this is the 1st block, it also writes the block identification
388  *
389  * Returns: Number of bytes written
390  *
391  * @f: QEMUFile where to send the data
392  * @block: block that contains the page we want to send
393  * @offset: offset inside the block for the page
394  *          in the lower bits, it contains flags
395  */
396 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
397 {
398     size_t size, len;
399 
400     qemu_put_be64(f, offset);
401     size = 8;
402 
403     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
404         len = strlen(block->idstr);
405         qemu_put_byte(f, len);
406         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
407         size += 1 + len;
408     }
409     return size;
410 }
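
/*
 * For example, the first page sent from a block named "pc.ram" costs
 * 8 (offset/flags) + 1 (length byte) + 6 (idstr) = 15 header bytes; later
 * pages from the same block carry RAM_SAVE_FLAG_CONTINUE and only the
 * 8-byte offset/flags word.
 */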
411 
412 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
413  * If guest dirty memory rate is reduced below the rate at which we can
414  * transfer pages to the destination then we should be able to complete
415  * migration. Some workloads dirty memory way too fast and will not effectively
416  * converge, even with auto-converge.
417  */
418 static void mig_throttle_guest_down(void)
419 {
420     MigrationState *s = migrate_get_current();
421     uint64_t pct_initial =
422             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
423     uint64_t pct_increment =
424             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
425 
426     /* We have not started throttling yet. Let's start it. */
427     if (!cpu_throttle_active()) {
428         cpu_throttle_set(pct_initial);
429     } else {
430         /* Throttling already on, just increase the rate */
431         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
432     }
433 }
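
/*
 * For example, with an initial throttle of 20% and an increment of 10%
 * (hypothetical parameter values), successive calls drive the guest CPU
 * throttle to 20%, 30%, 40%, ... until migration converges or throttling
 * is stopped.
 */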
434 
435 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
436  * The important thing is that a stale (not-yet-0'd) page be replaced
437  * by the new data.
438  * As a bonus, if the page wasn't in the cache it gets added so that
439  * when a small write is made into the 0'd page it can be sent via XBZRLE.
440  */
441 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
442 {
443     if (ram_bulk_stage || !migrate_use_xbzrle()) {
444         return;
445     }
446 
447     /* We don't care if this fails to allocate a new cache page
448      * as long as it updated an old one */
449     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
450                  bitmap_sync_count);
451 }
452 
453 #define ENCODING_FLAG_XBZRLE 0x1
454 
455 /**
456  * save_xbzrle_page: compress and send current page
457  *
458  * Returns: 1 means that we wrote the page
459  *          0 means that page is identical to the one already sent
460  *          -1 means that xbzrle would be longer than normal
461  *
462  * @f: QEMUFile where to send the data
463  * @current_data: host pointer to the page; may be updated to the cached copy
464  * @current_addr: RAM address of the page, used as the cache key
465  * @block: block that contains the page we want to send
466  * @offset: offset inside the block for the page
467  * @last_stage: if we are at the completion stage
468  * @bytes_transferred: increase it with the number of transferred bytes
469  */
470 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
471                             ram_addr_t current_addr, RAMBlock *block,
472                             ram_addr_t offset, bool last_stage,
473                             uint64_t *bytes_transferred)
474 {
475     int encoded_len = 0, bytes_xbzrle;
476     uint8_t *prev_cached_page;
477 
478     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
479         acct_info.xbzrle_cache_miss++;
480         if (!last_stage) {
481             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
482                              bitmap_sync_count) == -1) {
483                 return -1;
484             } else {
485                 /* update *current_data when the page has been
486                    inserted into cache */
487                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
488             }
489         }
490         return -1;
491     }
492 
493     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
494 
495     /* save current buffer into memory */
496     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
497 
498     /* XBZRLE encoding (if there is no overflow) */
499     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
500                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
501                                        TARGET_PAGE_SIZE);
502     if (encoded_len == 0) {
503         DPRINTF("Skipping unmodified page\n");
504         return 0;
505     } else if (encoded_len == -1) {
506         DPRINTF("Overflow\n");
507         acct_info.xbzrle_overflows++;
508         /* update data in the cache */
509         if (!last_stage) {
510             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
511             *current_data = prev_cached_page;
512         }
513         return -1;
514     }
515 
516     /* update the cached copy so later deltas are computed against the data just sent */
517     if (!last_stage) {
518         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
519     }
520 
521     /* Send XBZRLE based compressed page */
522     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
523     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
524     qemu_put_be16(f, encoded_len);
525     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
526     bytes_xbzrle += encoded_len + 1 + 2;
527     acct_info.xbzrle_pages++;
528     acct_info.xbzrle_bytes += bytes_xbzrle;
529     *bytes_transferred += bytes_xbzrle;
530 
531     return 1;
532 }
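
/*
 * Example of the accounting above: a delta that encodes into 100 bytes for
 * a non-first page of a block costs 8 (header) + 1 (encoding flag) +
 * 2 (length) + 100 = 111 bytes on the wire.
 */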
533 
534 /* Called with rcu_read_lock() to protect migration_bitmap */
535 static inline
536 ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
537                                                  ram_addr_t start)
538 {
539     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
540     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
541     uint64_t rb_size = rb->used_length;
542     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
543     unsigned long *bitmap;
544 
545     unsigned long next;
546 
547     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
548     if (ram_bulk_stage && nr > base) {
549         next = nr + 1;
550     } else {
551         next = find_next_bit(bitmap, size, nr);
552     }
553 
554     if (next < size) {
555         clear_bit(next, bitmap);
556         migration_dirty_pages--;
557     }
558     return (next - base) << TARGET_PAGE_BITS;
559 }
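
/*
 * Illustration (assuming 4 KiB target pages, i.e. TARGET_PAGE_BITS == 12):
 * a block at ram_addr offset 0x40000000 has base bit 0x40000; a search
 * starting at in-block offset 0x3000 begins at bit 0x40003, and a dirty
 * bit found at 0x40010 is cleared and reported back as in-block offset
 * (0x40010 - 0x40000) << 12 == 0x10000.
 */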
560 
561 /* Called with rcu_read_lock() to protect migration_bitmap */
562 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
563 {
564     unsigned long *bitmap;
565     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
566     migration_dirty_pages +=
567         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
568 }
569 
570 /* Fix me: there are too many global variables used in the migration process. */
571 static int64_t start_time;
572 static int64_t bytes_xfer_prev;
573 static int64_t num_dirty_pages_period;
574 static uint64_t xbzrle_cache_miss_prev;
575 static uint64_t iterations_prev;
576 
577 static void migration_bitmap_sync_init(void)
578 {
579     start_time = 0;
580     bytes_xfer_prev = 0;
581     num_dirty_pages_period = 0;
582     xbzrle_cache_miss_prev = 0;
583     iterations_prev = 0;
584 }
585 
586 /* Called with iothread lock held, to protect ram_list.dirty_memory[] */
587 static void migration_bitmap_sync(void)
588 {
589     RAMBlock *block;
590     uint64_t num_dirty_pages_init = migration_dirty_pages;
591     MigrationState *s = migrate_get_current();
592     int64_t end_time;
593     int64_t bytes_xfer_now;
594 
595     bitmap_sync_count++;
596 
597     if (!bytes_xfer_prev) {
598         bytes_xfer_prev = ram_bytes_transferred();
599     }
600 
601     if (!start_time) {
602         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
603     }
604 
605     trace_migration_bitmap_sync_start();
606     address_space_sync_dirty_bitmap(&address_space_memory);
607 
608     qemu_mutex_lock(&migration_bitmap_mutex);
609     rcu_read_lock();
610     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
611         migration_bitmap_sync_range(block->offset, block->used_length);
612     }
613     rcu_read_unlock();
614     qemu_mutex_unlock(&migration_bitmap_mutex);
615 
616     trace_migration_bitmap_sync_end(migration_dirty_pages
617                                     - num_dirty_pages_init);
618     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
619     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
620 
621     /* more than 1 second = 1000 milliseconds */
622     if (end_time > start_time + 1000) {
623         if (migrate_auto_converge()) {
624             /* The following detection logic can be refined later. For now:
625                Check to see if the dirtied bytes exceed 50% of the approx.
626                amount of bytes that just got transferred since the last time we
627                were in this routine. If that keeps happening, start or increase
628                throttling */
629             bytes_xfer_now = ram_bytes_transferred();
630 
631             if (s->dirty_pages_rate &&
632                (num_dirty_pages_period * TARGET_PAGE_SIZE >
633                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
634                (dirty_rate_high_cnt++ >= 2)) {
635                     trace_migration_throttle();
636                     dirty_rate_high_cnt = 0;
637                     mig_throttle_guest_down();
638              }
639              bytes_xfer_prev = bytes_xfer_now;
640         }
641 
642         if (migrate_use_xbzrle()) {
643             if (iterations_prev != acct_info.iterations) {
644                 acct_info.xbzrle_cache_miss_rate =
645                    (double)(acct_info.xbzrle_cache_miss -
646                             xbzrle_cache_miss_prev) /
647                    (acct_info.iterations - iterations_prev);
648             }
649             iterations_prev = acct_info.iterations;
650             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
651         }
652         s->dirty_pages_rate = num_dirty_pages_period * 1000
653             / (end_time - start_time);
654         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
655         start_time = end_time;
656         num_dirty_pages_period = 0;
657     }
658     s->dirty_sync_count = bitmap_sync_count;
659 }
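
/*
 * Auto-converge example (4 KiB target pages assumed): if ~38400 pages
 * (150 MiB) were dirtied during the last second while only 200 MiB were
 * transferred, then 150 MiB > 200 MiB / 2 and dirty_rate_high_cnt is
 * bumped; once that has happened often enough the guest is throttled via
 * mig_throttle_guest_down().
 */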
660 
661 /**
662  * save_zero_page: Send the zero page to the stream
663  *
664  * Returns: Number of pages written.
665  *
666  * @f: QEMUFile where to send the data
667  * @block: block that contains the page we want to send
668  * @offset: offset inside the block for the page
669  * @p: pointer to the page
670  * @bytes_transferred: increase it with the number of transferred bytes
671  */
672 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
673                           uint8_t *p, uint64_t *bytes_transferred)
674 {
675     int pages = -1;
676 
677     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
678         acct_info.dup_pages++;
679         *bytes_transferred += save_page_header(f, block,
680                                                offset | RAM_SAVE_FLAG_COMPRESS);
681         qemu_put_byte(f, 0);
682         *bytes_transferred += 1;
683         pages = 1;
684     }
685 
686     return pages;
687 }
688 
689 /**
690  * ram_save_page: Send the given page to the stream
691  *
692  * Returns: Number of pages written.
693  *
694  * @f: QEMUFile where to send the data
695  * @block: block that contains the page we want to send
696  * @offset: offset inside the block for the page
697  * @last_stage: if we are at the completion stage
698  * @bytes_transferred: increase it with the number of transferred bytes
699  */
700 static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
701                          bool last_stage, uint64_t *bytes_transferred)
702 {
703     int pages = -1;
704     uint64_t bytes_xmit;
705     ram_addr_t current_addr;
706     uint8_t *p;
707     int ret;
708     bool send_async = true;
709 
710     p = block->host + offset;
711 
712     /* If in doubt, send the page as normal */
713     bytes_xmit = 0;
714     ret = ram_control_save_page(f, block->offset,
715                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
716     if (bytes_xmit) {
717         *bytes_transferred += bytes_xmit;
718         pages = 1;
719     }
720 
721     XBZRLE_cache_lock();
722 
723     current_addr = block->offset + offset;
724 
725     if (block == last_sent_block) {
726         offset |= RAM_SAVE_FLAG_CONTINUE;
727     }
728     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
729         if (ret != RAM_SAVE_CONTROL_DELAYED) {
730             if (bytes_xmit > 0) {
731                 acct_info.norm_pages++;
732             } else if (bytes_xmit == 0) {
733                 acct_info.dup_pages++;
734             }
735         }
736     } else {
737         pages = save_zero_page(f, block, offset, p, bytes_transferred);
738         if (pages > 0) {
739             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
740              * page would be stale
741              */
742             xbzrle_cache_zero_page(current_addr);
743         } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
744             pages = save_xbzrle_page(f, &p, current_addr, block,
745                                      offset, last_stage, bytes_transferred);
746             if (!last_stage) {
747                 /* Can't send this cached data async, since the cache page
748                  * might get updated before it gets to the wire
749                  */
750                 send_async = false;
751             }
752         }
753     }
754 
755     /* XBZRLE overflow or normal page */
756     if (pages == -1) {
757         *bytes_transferred += save_page_header(f, block,
758                                                offset | RAM_SAVE_FLAG_PAGE);
759         if (send_async) {
760             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
761         } else {
762             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
763         }
764         *bytes_transferred += TARGET_PAGE_SIZE;
765         pages = 1;
766         acct_info.norm_pages++;
767     }
768 
769     XBZRLE_cache_unlock();
770 
771     return pages;
772 }
773 
774 static int do_compress_ram_page(CompressParam *param)
775 {
776     int bytes_sent, blen;
777     uint8_t *p;
778     RAMBlock *block = param->block;
779     ram_addr_t offset = param->offset;
780 
781     p = block->host + (offset & TARGET_PAGE_MASK);
782 
783     bytes_sent = save_page_header(param->file, block, offset |
784                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
785     blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
786                                      migrate_compress_level());
787     bytes_sent += blen;
788 
789     return bytes_sent;
790 }
791 
792 static inline void start_compression(CompressParam *param)
793 {
794     param->done = false;
795     qemu_mutex_lock(&param->mutex);
796     param->start = true;
797     qemu_cond_signal(&param->cond);
798     qemu_mutex_unlock(&param->mutex);
799 }
800 
801 static inline void start_decompression(DecompressParam *param)
802 {
803     qemu_mutex_lock(&param->mutex);
804     param->start = true;
805     qemu_cond_signal(&param->cond);
806     qemu_mutex_unlock(&param->mutex);
807 }
808 
809 static uint64_t bytes_transferred;
810 
811 static void flush_compressed_data(QEMUFile *f)
812 {
813     int idx, len, thread_count;
814 
815     if (!migrate_use_compression()) {
816         return;
817     }
818     thread_count = migrate_compress_threads();
819     for (idx = 0; idx < thread_count; idx++) {
820         if (!comp_param[idx].done) {
821             qemu_mutex_lock(comp_done_lock);
822             while (!comp_param[idx].done && !quit_comp_thread) {
823                 qemu_cond_wait(comp_done_cond, comp_done_lock);
824             }
825             qemu_mutex_unlock(comp_done_lock);
826         }
827         if (!quit_comp_thread) {
828             len = qemu_put_qemu_file(f, comp_param[idx].file);
829             bytes_transferred += len;
830         }
831     }
832 }
833 
834 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
835                                        ram_addr_t offset)
836 {
837     param->block = block;
838     param->offset = offset;
839 }
840 
841 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
842                                            ram_addr_t offset,
843                                            uint64_t *bytes_transferred)
844 {
845     int idx, thread_count, bytes_xmit = -1, pages = -1;
846 
847     thread_count = migrate_compress_threads();
848     qemu_mutex_lock(comp_done_lock);
849     while (true) {
850         for (idx = 0; idx < thread_count; idx++) {
851             if (comp_param[idx].done) {
852                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
853                 set_compress_params(&comp_param[idx], block, offset);
854                 start_compression(&comp_param[idx]);
855                 pages = 1;
856                 acct_info.norm_pages++;
857                 *bytes_transferred += bytes_xmit;
858                 break;
859             }
860         }
861         if (pages > 0) {
862             break;
863         } else {
864             qemu_cond_wait(comp_done_cond, comp_done_lock);
865         }
866     }
867     qemu_mutex_unlock(comp_done_lock);
868 
869     return pages;
870 }
871 
872 /**
873  * ram_save_compressed_page: compress the given page and send it to the stream
874  *
875  * Returns: Number of pages written.
876  *
877  * @f: QEMUFile where to send the data
878  * @block: block that contains the page we want to send
879  * @offset: offset inside the block for the page
880  * @last_stage: if we are at the completion stage
881  * @bytes_transferred: increase it with the number of transferred bytes
882  */
883 static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
884                                     ram_addr_t offset, bool last_stage,
885                                     uint64_t *bytes_transferred)
886 {
887     int pages = -1;
888     uint64_t bytes_xmit;
889     uint8_t *p;
890     int ret;
891 
892     p = block->host + offset;
893 
894     bytes_xmit = 0;
895     ret = ram_control_save_page(f, block->offset,
896                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
897     if (bytes_xmit) {
898         *bytes_transferred += bytes_xmit;
899         pages = 1;
900     }
901     if (block == last_sent_block) {
902         offset |= RAM_SAVE_FLAG_CONTINUE;
903     }
904     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
905         if (ret != RAM_SAVE_CONTROL_DELAYED) {
906             if (bytes_xmit > 0) {
907                 acct_info.norm_pages++;
908             } else if (bytes_xmit == 0) {
909                 acct_info.dup_pages++;
910             }
911         }
912     } else {
913         /* When starting to process a new block, the first page of
914          * the block should be sent out before other pages in the same
915          * block, and all the pages in the previous block should have been
916          * sent out. Keeping this order is important, because the 'cont'
917          * flag is used to avoid resending the block name.
918          */
919         if (block != last_sent_block) {
920             flush_compressed_data(f);
921             pages = save_zero_page(f, block, offset, p, bytes_transferred);
922             if (pages == -1) {
923                 set_compress_params(&comp_param[0], block, offset);
924                 /* Use the qemu thread to compress the data to make sure the
925                  * first page is sent out before other pages
926                  */
927                 bytes_xmit = do_compress_ram_page(&comp_param[0]);
928                 acct_info.norm_pages++;
929                 qemu_put_qemu_file(f, comp_param[0].file);
930                 *bytes_transferred += bytes_xmit;
931                 pages = 1;
932             }
933         } else {
934             pages = save_zero_page(f, block, offset, p, bytes_transferred);
935             if (pages == -1) {
936                 pages = compress_page_with_multi_thread(f, block, offset,
937                                                         bytes_transferred);
938             }
939         }
940     }
941 
942     return pages;
943 }
944 
945 /*
946  * Find the next dirty page and update any state associated with
947  * the search process.
948  *
949  * Returns: True if a page is found
950  *
951  * @f: Current migration stream.
952  * @pss: Data about the state of the current dirty page scan.
953  * @*again: Set to false if the search has scanned the whole of RAM
954  */
955 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
956                              bool *again)
957 {
958     pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
959                                                        pss->offset);
960     if (pss->complete_round && pss->block == last_seen_block &&
961         pss->offset >= last_offset) {
962         /*
963          * We've been once around the RAM and haven't found anything.
964          * Give up.
965          */
966         *again = false;
967         return false;
968     }
969     if (pss->offset >= pss->block->used_length) {
970         /* Didn't find anything in this RAM Block */
971         pss->offset = 0;
972         pss->block = QLIST_NEXT_RCU(pss->block, next);
973         if (!pss->block) {
974             /* Hit the end of the list */
975             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
976             /* Flag that we've looped */
977             pss->complete_round = true;
978             ram_bulk_stage = false;
979             if (migrate_use_xbzrle()) {
980                 /* If xbzrle is on, stop using the data compression at this
981                  * point. In theory, xbzrle can do better than compression.
982                  */
983                 flush_compressed_data(f);
984                 compression_switch = false;
985             }
986         }
987         /* Didn't find anything this time, but try again on the new block */
988         *again = true;
989         return false;
990     } else {
991         /* Can go around again, but... */
992         *again = true;
993         /* We've found something so probably don't need to */
994         return true;
995     }
996 }
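
/*
 * To summarise the (found, *again) combinations returned above:
 *   found == true,  *again == true  -> a dirty page was found; send it.
 *   found == false, *again == true  -> nothing in this block; the caller
 *                                      should retry on the (new) pss->block.
 *   found == false, *again == false -> the whole of RAM has been scanned
 *                                      with no dirty page; stop searching.
 */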
997 
998 /**
999  * ram_find_and_save_block: Finds a dirty page and sends it to f
1000  *
1001  * Called within an RCU critical section.
1002  *
1003  * Returns:  The number of pages written
1004  *           0 means no dirty pages
1005  *
1006  * @f: QEMUFile where to send the data
1007  * @last_stage: if we are at the completion stage
1008  * @bytes_transferred: increase it with the number of transferred bytes
1009  */
1010 
1011 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1012                                    uint64_t *bytes_transferred)
1013 {
1014     PageSearchStatus pss;
1015     int pages = 0;
1016     bool again, found;
1017 
1018     pss.block = last_seen_block;
1019     pss.offset = last_offset;
1020     pss.complete_round = false;
1021 
1022     if (!pss.block) {
1023         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1024     }
1025 
1026     do {
1027         found = find_dirty_block(f, &pss, &again);
1028 
1029         if (found) {
1030             if (compression_switch && migrate_use_compression()) {
1031                 pages = ram_save_compressed_page(f, pss.block, pss.offset,
1032                                                  last_stage,
1033                                                  bytes_transferred);
1034             } else {
1035                 pages = ram_save_page(f, pss.block, pss.offset, last_stage,
1036                                       bytes_transferred);
1037             }
1038 
1039             /* if page is unmodified, continue to the next */
1040             if (pages > 0) {
1041                 last_sent_block = pss.block;
1042             }
1043         }
1044     } while (!pages && again);
1045 
1046     last_seen_block = pss.block;
1047     last_offset = pss.offset;
1048 
1049     return pages;
1050 }
1051 
1052 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1053 {
1054     uint64_t pages = size / TARGET_PAGE_SIZE;
1055     if (zero) {
1056         acct_info.dup_pages += pages;
1057     } else {
1058         acct_info.norm_pages += pages;
1059         bytes_transferred += size;
1060         qemu_update_position(f, size);
1061     }
1062 }
1063 
1064 static ram_addr_t ram_save_remaining(void)
1065 {
1066     return migration_dirty_pages;
1067 }
1068 
1069 uint64_t ram_bytes_remaining(void)
1070 {
1071     return ram_save_remaining() * TARGET_PAGE_SIZE;
1072 }
1073 
1074 uint64_t ram_bytes_transferred(void)
1075 {
1076     return bytes_transferred;
1077 }
1078 
1079 uint64_t ram_bytes_total(void)
1080 {
1081     RAMBlock *block;
1082     uint64_t total = 0;
1083 
1084     rcu_read_lock();
1085     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1086         total += block->used_length;
1087     rcu_read_unlock();
1088     return total;
1089 }
1090 
1091 void free_xbzrle_decoded_buf(void)
1092 {
1093     g_free(xbzrle_decoded_buf);
1094     xbzrle_decoded_buf = NULL;
1095 }
1096 
1097 static void migration_bitmap_free(struct BitmapRcu *bmap)
1098 {
1099     g_free(bmap->bmap);
1100     g_free(bmap);
1101 }
1102 
1103 static void ram_migration_cleanup(void *opaque)
1104 {
1105     /* The caller must hold the iothread lock or be in a bottom half, so
1106      * there is no write race against this migration_bitmap.
1107      */
1108     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1109     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1110     if (bitmap) {
1111         memory_global_dirty_log_stop();
1112         call_rcu(bitmap, migration_bitmap_free, rcu);
1113     }
1114 
1115     XBZRLE_cache_lock();
1116     if (XBZRLE.cache) {
1117         cache_fini(XBZRLE.cache);
1118         g_free(XBZRLE.encoded_buf);
1119         g_free(XBZRLE.current_buf);
1120         XBZRLE.cache = NULL;
1121         XBZRLE.encoded_buf = NULL;
1122         XBZRLE.current_buf = NULL;
1123     }
1124     XBZRLE_cache_unlock();
1125 }
1126 
1127 static void reset_ram_globals(void)
1128 {
1129     last_seen_block = NULL;
1130     last_sent_block = NULL;
1131     last_offset = 0;
1132     last_version = ram_list.version;
1133     ram_bulk_stage = true;
1134 }
1135 
1136 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1137 
1138 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1139 {
1140     /* Called in the qemu main thread, so there is
1141      * no write race against this migration_bitmap.
1142      */
1143     if (migration_bitmap_rcu) {
1144         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1145         bitmap = g_new(struct BitmapRcu, 1);
1146         bitmap->bmap = bitmap_new(new);
1147 
1148         /* Prevent bits in the migration_bitmap from being set
1149          * by migration_bitmap_sync_range() at the same time.
1150          * It is safe for migration if a migration_bitmap bit is
1151          * cleared at the same time.
1152          */
1153         qemu_mutex_lock(&migration_bitmap_mutex);
1154         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1155         bitmap_set(bitmap->bmap, old, new - old);
1156         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1157         qemu_mutex_unlock(&migration_bitmap_mutex);
1158         migration_dirty_pages += new - old;
1159         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1160     }
1161 }
1162 
1163 /*
1164  * 'expected' is the value you expect the bitmap mostly to be full
1165  * of; it won't bother printing lines that are all this value.
1166  * If 'todump' is null the migration bitmap is dumped.
1167  */
1168 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1169 {
1170     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1171 
1172     int64_t cur;
1173     int64_t linelen = 128;
1174     char linebuf[129];
1175 
1176     if (!todump) {
1177         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1178     }
1179 
1180     for (cur = 0; cur < ram_pages; cur += linelen) {
1181         int64_t curb;
1182         bool found = false;
1183         /*
1184          * Last line; catch the case where the line length
1185          * is longer than remaining ram
1186          */
1187         if (cur + linelen > ram_pages) {
1188             linelen = ram_pages - cur;
1189         }
1190         for (curb = 0; curb < linelen; curb++) {
1191             bool thisbit = test_bit(cur + curb, todump);
1192             linebuf[curb] = thisbit ? '1' : '.';
1193             found = found || (thisbit != expected);
1194         }
1195         if (found) {
1196             linebuf[curb] = '\0';
1197             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1198         }
1199     }
1200 }
1201 
1202 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1203  * long-running RCU critical section.  When rcu-reclaims in the code
1204  * start to become numerous it will be necessary to reduce the
1205  * granularity of these critical sections.
1206  */
1207 
1208 static int ram_save_setup(QEMUFile *f, void *opaque)
1209 {
1210     RAMBlock *block;
1211     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1212 
1213     dirty_rate_high_cnt = 0;
1214     bitmap_sync_count = 0;
1215     migration_bitmap_sync_init();
1216     qemu_mutex_init(&migration_bitmap_mutex);
1217 
1218     if (migrate_use_xbzrle()) {
1219         XBZRLE_cache_lock();
1220         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1221                                   TARGET_PAGE_SIZE,
1222                                   TARGET_PAGE_SIZE);
1223         if (!XBZRLE.cache) {
1224             XBZRLE_cache_unlock();
1225             error_report("Error creating cache");
1226             return -1;
1227         }
1228         XBZRLE_cache_unlock();
1229 
1230         /* We prefer not to abort if there is no memory */
1231         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1232         if (!XBZRLE.encoded_buf) {
1233             error_report("Error allocating encoded_buf");
1234             return -1;
1235         }
1236 
1237         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1238         if (!XBZRLE.current_buf) {
1239             error_report("Error allocating current_buf");
1240             g_free(XBZRLE.encoded_buf);
1241             XBZRLE.encoded_buf = NULL;
1242             return -1;
1243         }
1244 
1245         acct_clear();
1246     }
1247 
1248     /* iothread lock needed for ram_list.dirty_memory[] */
1249     qemu_mutex_lock_iothread();
1250     qemu_mutex_lock_ramlist();
1251     rcu_read_lock();
1252     bytes_transferred = 0;
1253     reset_ram_globals();
1254 
1255     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1256     migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
1257     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1258     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1259 
1260     /*
1261      * Count the total number of pages used by ram blocks not including any
1262      * gaps due to alignment or unplugs.
1263      */
1264     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1265 
1266     memory_global_dirty_log_start();
1267     migration_bitmap_sync();
1268     qemu_mutex_unlock_ramlist();
1269     qemu_mutex_unlock_iothread();
1270 
1271     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1272 
1273     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1274         qemu_put_byte(f, strlen(block->idstr));
1275         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1276         qemu_put_be64(f, block->used_length);
1277     }
1278 
1279     rcu_read_unlock();
1280 
1281     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1282     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1283 
1284     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1285 
1286     return 0;
1287 }
1288 
1289 static int ram_save_iterate(QEMUFile *f, void *opaque)
1290 {
1291     int ret;
1292     int i;
1293     int64_t t0;
1294     int pages_sent = 0;
1295 
1296     rcu_read_lock();
1297     if (ram_list.version != last_version) {
1298         reset_ram_globals();
1299     }
1300 
1301     /* Read version before ram_list.blocks */
1302     smp_rmb();
1303 
1304     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1305 
1306     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1307     i = 0;
1308     while ((ret = qemu_file_rate_limit(f)) == 0) {
1309         int pages;
1310 
1311         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1312         /* no more pages to send */
1313         if (pages == 0) {
1314             break;
1315         }
1316         pages_sent += pages;
1317         acct_info.iterations++;
1318 
1319         /* we want to check in the 1st loop, just in case it was the 1st time
1320            and we had to sync the dirty bitmap.
1321            qemu_clock_get_ns() is a bit expensive, so we only check every few
1322            iterations
1323         */
1324         if ((i & 63) == 0) {
1325             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1326             if (t1 > MAX_WAIT) {
1327                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
1328                         t1, i);
1329                 break;
1330             }
1331         }
1332         i++;
1333     }
1334     flush_compressed_data(f);
1335     rcu_read_unlock();
1336 
1337     /*
1338      * Must occur before EOS (or any QEMUFile operation)
1339      * because of RDMA protocol.
1340      */
1341     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1342 
1343     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1344     bytes_transferred += 8;
1345 
1346     ret = qemu_file_get_error(f);
1347     if (ret < 0) {
1348         return ret;
1349     }
1350 
1351     return pages_sent;
1352 }
1353 
1354 /* Called with iothread lock */
1355 static int ram_save_complete(QEMUFile *f, void *opaque)
1356 {
1357     rcu_read_lock();
1358 
1359     migration_bitmap_sync();
1360 
1361     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
1362 
1363     /* try transferring iterative blocks of memory */
1364 
1365     /* flush all remaining blocks regardless of rate limiting */
1366     while (true) {
1367         int pages;
1368 
1369         pages = ram_find_and_save_block(f, true, &bytes_transferred);
1370         /* no more blocks to send */
1371         if (pages == 0) {
1372             break;
1373         }
1374     }
1375 
1376     flush_compressed_data(f);
1377     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
1378 
1379     rcu_read_unlock();
1380 
1381     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1382 
1383     return 0;
1384 }
1385 
1386 static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
1387 {
1388     uint64_t remaining_size;
1389 
1390     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1391 
1392     if (remaining_size < max_size) {
1393         qemu_mutex_lock_iothread();
1394         rcu_read_lock();
1395         migration_bitmap_sync();
1396         rcu_read_unlock();
1397         qemu_mutex_unlock_iothread();
1398         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1399     }
1400     return remaining_size;
1401 }
1402 
1403 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
1404 {
1405     unsigned int xh_len;
1406     int xh_flags;
1407 
1408     if (!xbzrle_decoded_buf) {
1409         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1410     }
1411 
1412     /* extract RLE header */
1413     xh_flags = qemu_get_byte(f);
1414     xh_len = qemu_get_be16(f);
1415 
1416     if (xh_flags != ENCODING_FLAG_XBZRLE) {
1417         error_report("Failed to load XBZRLE page - wrong compression!");
1418         return -1;
1419     }
1420 
1421     if (xh_len > TARGET_PAGE_SIZE) {
1422         error_report("Failed to load XBZRLE page - len overflow!");
1423         return -1;
1424     }
1425     /* load data and decode */
1426     qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
1427 
1428     /* decode RLE */
1429     if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
1430                              TARGET_PAGE_SIZE) == -1) {
1431         error_report("Failed to load XBZRLE page - decode error!");
1432         return -1;
1433     }
1434 
1435     return 0;
1436 }
1437 
1438 /* Must be called from within a rcu critical section.
1439  * Returns a pointer from within the RCU-protected ram_list.
1440  */
1441 static inline void *host_from_stream_offset(QEMUFile *f,
1442                                             ram_addr_t offset,
1443                                             int flags)
1444 {
1445     static RAMBlock *block = NULL;
1446     char id[256];
1447     uint8_t len;
1448 
1449     if (flags & RAM_SAVE_FLAG_CONTINUE) {
1450         if (!block || block->max_length <= offset) {
1451             error_report("Ack, bad migration stream!");
1452             return NULL;
1453         }
1454 
1455         return block->host + offset;
1456     }
1457 
1458     len = qemu_get_byte(f);
1459     qemu_get_buffer(f, (uint8_t *)id, len);
1460     id[len] = 0;
1461 
1462     block = qemu_ram_block_by_name(id);
1463     if (block && block->max_length > offset) {
1464         return block->host + offset;
1465     }
1466 
1467     error_report("Can't find block %s", id);
1468     return NULL;
1469 }
1470 
1471 /*
1472  * If a page (or a whole RDMA chunk) has been
1473  * determined to be zero, then zap it.
1474  */
1475 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
1476 {
1477     if (ch != 0 || !is_zero_range(host, size)) {
1478         memset(host, ch, size);
1479     }
1480 }
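
/*
 * E.g. a zero page arriving with RAM_SAVE_FLAG_COMPRESS and ch == 0 only
 * triggers a memset() if the destination page is not already all zeroes,
 * which avoids needlessly dirtying (and allocating) the destination's
 * memory.
 */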
1481 
1482 static void *do_data_decompress(void *opaque)
1483 {
1484     DecompressParam *param = opaque;
1485     unsigned long pagesize;
1486 
1487     while (!quit_decomp_thread) {
1488         qemu_mutex_lock(&param->mutex);
1489         while (!param->start && !quit_decomp_thread) {
1490             qemu_cond_wait(&param->cond, &param->mutex);
1491             pagesize = TARGET_PAGE_SIZE;
1492             if (!quit_decomp_thread) {
1493                 /* uncompress() may fail in some cases, especially
1494                  * when the page was dirtied while being compressed. It is
1495                  * not a problem because the dirty page will be retransferred
1496                  * and uncompress() won't corrupt the data in other pages.
1497                  */
1498                 uncompress((Bytef *)param->des, &pagesize,
1499                            (const Bytef *)param->compbuf, param->len);
1500             }
1501             param->start = false;
1502         }
1503         qemu_mutex_unlock(&param->mutex);
1504     }
1505 
1506     return NULL;
1507 }
1508 
1509 void migrate_decompress_threads_create(void)
1510 {
1511     int i, thread_count;
1512 
1513     thread_count = migrate_decompress_threads();
1514     decompress_threads = g_new0(QemuThread, thread_count);
1515     decomp_param = g_new0(DecompressParam, thread_count);
1516     compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1517     quit_decomp_thread = false;
1518     for (i = 0; i < thread_count; i++) {
1519         qemu_mutex_init(&decomp_param[i].mutex);
1520         qemu_cond_init(&decomp_param[i].cond);
1521         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1522         qemu_thread_create(decompress_threads + i, "decompress",
1523                            do_data_decompress, decomp_param + i,
1524                            QEMU_THREAD_JOINABLE);
1525     }
1526 }
1527 
1528 void migrate_decompress_threads_join(void)
1529 {
1530     int i, thread_count;
1531 
1532     quit_decomp_thread = true;
1533     thread_count = migrate_decompress_threads();
1534     for (i = 0; i < thread_count; i++) {
1535         qemu_mutex_lock(&decomp_param[i].mutex);
1536         qemu_cond_signal(&decomp_param[i].cond);
1537         qemu_mutex_unlock(&decomp_param[i].mutex);
1538     }
1539     for (i = 0; i < thread_count; i++) {
1540         qemu_thread_join(decompress_threads + i);
1541         qemu_mutex_destroy(&decomp_param[i].mutex);
1542         qemu_cond_destroy(&decomp_param[i].cond);
1543         g_free(decomp_param[i].compbuf);
1544     }
1545     g_free(decompress_threads);
1546     g_free(decomp_param);
1547     g_free(compressed_data_buf);
1548     decompress_threads = NULL;
1549     decomp_param = NULL;
1550     compressed_data_buf = NULL;
1551 }
1552 
1553 static void decompress_data_with_multi_threads(uint8_t *compbuf,
1554                                                void *host, int len)
1555 {
1556     int idx, thread_count;
1557 
1558     thread_count = migrate_decompress_threads();
1559     while (true) {
1560         for (idx = 0; idx < thread_count; idx++) {
1561             if (!decomp_param[idx].start) {
1562                 memcpy(decomp_param[idx].compbuf, compbuf, len);
1563                 decomp_param[idx].des = host;
1564                 decomp_param[idx].len = len;
1565                 start_decompression(&decomp_param[idx]);
1566                 break;
1567             }
1568         }
1569         if (idx < thread_count) {
1570             break;
1571         }
1572     }
1573 }
1574 
1575 static int ram_load(QEMUFile *f, void *opaque, int version_id)
1576 {
1577     int flags = 0, ret = 0;
1578     static uint64_t seq_iter;
1579     int len = 0;
1580 
1581     seq_iter++;
1582 
1583     if (version_id != 4) {
1584         ret = -EINVAL;
1585     }
1586 
1587     /* This RCU critical section can be very long running.
1588      * When RCU reclaims in the code start to become numerous,
1589      * it will be necessary to reduce the granularity of this
1590      * critical section.
1591      */
1592     rcu_read_lock();
1593     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
1594         ram_addr_t addr, total_ram_bytes;
1595         void *host = NULL;
1596         uint8_t ch;
1597 
1598         addr = qemu_get_be64(f);
1599         flags = addr & ~TARGET_PAGE_MASK;
1600         addr &= TARGET_PAGE_MASK;
1601 
1602         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
1603                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
1604             host = host_from_stream_offset(f, addr, flags);
1605             if (!host) {
1606                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1607                 ret = -EINVAL;
1608                 break;
1609             }
1610         }
1611 
1612         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
1613         case RAM_SAVE_FLAG_MEM_SIZE:
1614             /* Synchronize RAM block list */
1615             total_ram_bytes = addr;
1616             while (!ret && total_ram_bytes) {
1617                 RAMBlock *block;
1618                 char id[256];
1619                 ram_addr_t length;
1620 
1621                 len = qemu_get_byte(f);
1622                 qemu_get_buffer(f, (uint8_t *)id, len);
1623                 id[len] = 0;
1624                 length = qemu_get_be64(f);
1625 
1626                 block = qemu_ram_block_by_name(id);
1627                 if (block) {
1628                     if (length != block->used_length) {
1629                         Error *local_err = NULL;
1630 
1631                         ret = qemu_ram_resize(block->offset, length,
1632                                               &local_err);
1633                         if (local_err) {
1634                             error_report_err(local_err);
1635                         }
1636                     }
1637                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
1638                                           block->idstr);
1639                 } else {
1640                     error_report("Unknown ramblock \"%s\", cannot "
1641                                  "accept migration", id);
1642                     ret = -EINVAL;
1643                 }
1644 
1645                 total_ram_bytes -= length;
1646             }
1647             break;
1648 
1649         case RAM_SAVE_FLAG_COMPRESS:
1650             ch = qemu_get_byte(f);
1651             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
1652             break;
1653 
1654         case RAM_SAVE_FLAG_PAGE:
1655             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
1656             break;
1657 
1658         case RAM_SAVE_FLAG_COMPRESS_PAGE:
1659             len = qemu_get_be32(f);
1660             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
1661                 error_report("Invalid compressed data length: %d", len);
1662                 ret = -EINVAL;
1663                 break;
1664             }
1665             qemu_get_buffer(f, compressed_data_buf, len);
1666             decompress_data_with_multi_threads(compressed_data_buf, host, len);
1667             break;
1668 
1669         case RAM_SAVE_FLAG_XBZRLE:
1670             if (load_xbzrle(f, addr, host) < 0) {
1671                 error_report("Failed to decompress XBZRLE page at "
1672                              RAM_ADDR_FMT, addr);
1673                 ret = -EINVAL;
1674                 break;
1675             }
1676             break;
1677         case RAM_SAVE_FLAG_EOS:
1678             /* normal exit */
1679             break;
1680         default:
1681             if (flags & RAM_SAVE_FLAG_HOOK) {
1682                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
1683             } else {
1684                 error_report("Unknown combination of migration flags: %#x",
1685                              flags);
1686                 ret = -EINVAL;
1687             }
1688         }
1689         if (!ret) {
1690             ret = qemu_file_get_error(f);
1691         }
1692     }
1693 
1694     rcu_read_unlock();
1695     DPRINTF("Completed load of VM with exit code %d seq iteration "
1696             "%" PRIu64 "\n", ret, seq_iter);
1697     return ret;
1698 }
1699 
1700 static SaveVMHandlers savevm_ram_handlers = {
1701     .save_live_setup = ram_save_setup,
1702     .save_live_iterate = ram_save_iterate,
1703     .save_live_complete_precopy = ram_save_complete,
1704     .save_live_pending = ram_save_pending,
1705     .load_state = ram_load,
1706     .cleanup = ram_migration_cleanup,
1707 };
1708 
1709 void ram_mig_init(void)
1710 {
1711     qemu_mutex_init(&XBZRLE.lock);
1712     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
1713 }
1714