xref: /openbmc/qemu/migration/ram.c (revision 4a09d0bb)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47 
48 static int dirty_rate_high_cnt;
49 
50 static uint64_t bitmap_sync_count;
51 
52 /***********************************************************/
53 /* ram save/restore */
54 
55 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
56 #define RAM_SAVE_FLAG_COMPRESS 0x02
57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
58 #define RAM_SAVE_FLAG_PAGE     0x08
59 #define RAM_SAVE_FLAG_EOS      0x10
60 #define RAM_SAVE_FLAG_CONTINUE 0x20
61 #define RAM_SAVE_FLAG_XBZRLE   0x40
62 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
63 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
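
/*
 * These flag bits are carried in the low bits of the 64-bit page offset
 * written by save_page_header(); page offsets are target-page aligned, so
 * those bits are otherwise zero.  A sketch of how a header is emitted
 * (illustrative only):
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 */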
64 
65 static uint8_t *ZERO_TARGET_PAGE;
66 
67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
68 {
69     return buffer_is_zero(p, size);
70 }
71 
72 /* This struct contains the XBZRLE cache and a static page
73    used by the compression */
74 static struct {
75     /* buffer used for XBZRLE encoding */
76     uint8_t *encoded_buf;
77     /* buffer for storing page content */
78     uint8_t *current_buf;
79     /* Cache for XBZRLE, Protected by lock. */
80     PageCache *cache;
81     QemuMutex lock;
82 } XBZRLE;
83 
84 /* buffer used for XBZRLE decoding */
85 static uint8_t *xbzrle_decoded_buf;
86 
87 static void XBZRLE_cache_lock(void)
88 {
89     if (migrate_use_xbzrle())
90         qemu_mutex_lock(&XBZRLE.lock);
91 }
92 
93 static void XBZRLE_cache_unlock(void)
94 {
95     if (migrate_use_xbzrle())
96         qemu_mutex_unlock(&XBZRLE.lock);
97 }
98 
99 /*
100  * called from qmp_migrate_set_cache_size in main thread, possibly while
101  * a migration is in progress.
102  * A running migration may be using the cache and might finish during this
103  * call, hence changes to the cache are protected by XBZRLE.lock.
104  */
105 int64_t xbzrle_cache_resize(int64_t new_size)
106 {
107     PageCache *new_cache;
108     int64_t ret;
109 
110     if (new_size < TARGET_PAGE_SIZE) {
111         return -1;
112     }
113 
114     XBZRLE_cache_lock();
115 
116     if (XBZRLE.cache != NULL) {
117         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
118             goto out_new_size;
119         }
120         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
121                                         TARGET_PAGE_SIZE);
122         if (!new_cache) {
123             error_report("Error creating cache");
124             ret = -1;
125             goto out;
126         }
127 
128         cache_fini(XBZRLE.cache);
129         XBZRLE.cache = new_cache;
130     }
131 
132 out_new_size:
133     ret = pow2floor(new_size);
134 out:
135     XBZRLE_cache_unlock();
136     return ret;
137 }
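
/*
 * Usage sketch (hypothetical numbers): a caller such as
 * qmp_migrate_set_cache_size() may pass a size that is not a power of two.
 * The value returned, and the value compared against
 * migrate_xbzrle_cache_size() above, is pow2floor(new_size); a request of
 * 300 MiB therefore yields 256 MiB.
 */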
138 
139 /* accounting for migration statistics */
140 typedef struct AccountingInfo {
141     uint64_t dup_pages;
142     uint64_t skipped_pages;
143     uint64_t norm_pages;
144     uint64_t iterations;
145     uint64_t xbzrle_bytes;
146     uint64_t xbzrle_pages;
147     uint64_t xbzrle_cache_miss;
148     double xbzrle_cache_miss_rate;
149     uint64_t xbzrle_overflows;
150 } AccountingInfo;
151 
152 static AccountingInfo acct_info;
153 
154 static void acct_clear(void)
155 {
156     memset(&acct_info, 0, sizeof(acct_info));
157 }
158 
159 uint64_t dup_mig_bytes_transferred(void)
160 {
161     return acct_info.dup_pages * TARGET_PAGE_SIZE;
162 }
163 
164 uint64_t dup_mig_pages_transferred(void)
165 {
166     return acct_info.dup_pages;
167 }
168 
169 uint64_t skipped_mig_bytes_transferred(void)
170 {
171     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
172 }
173 
174 uint64_t skipped_mig_pages_transferred(void)
175 {
176     return acct_info.skipped_pages;
177 }
178 
179 uint64_t norm_mig_bytes_transferred(void)
180 {
181     return acct_info.norm_pages * TARGET_PAGE_SIZE;
182 }
183 
184 uint64_t norm_mig_pages_transferred(void)
185 {
186     return acct_info.norm_pages;
187 }
188 
189 uint64_t xbzrle_mig_bytes_transferred(void)
190 {
191     return acct_info.xbzrle_bytes;
192 }
193 
194 uint64_t xbzrle_mig_pages_transferred(void)
195 {
196     return acct_info.xbzrle_pages;
197 }
198 
199 uint64_t xbzrle_mig_pages_cache_miss(void)
200 {
201     return acct_info.xbzrle_cache_miss;
202 }
203 
204 double xbzrle_mig_cache_miss_rate(void)
205 {
206     return acct_info.xbzrle_cache_miss_rate;
207 }
208 
209 uint64_t xbzrle_mig_pages_overflow(void)
210 {
211     return acct_info.xbzrle_overflows;
212 }
213 
214 /* This is the last block that we have visited searching for dirty pages
215  */
216 static RAMBlock *last_seen_block;
217 /* This is the last block from where we have sent data */
218 static RAMBlock *last_sent_block;
219 static ram_addr_t last_offset;
220 static QemuMutex migration_bitmap_mutex;
221 static uint64_t migration_dirty_pages;
222 static uint32_t last_version;
223 static bool ram_bulk_stage;
224 
225 /* used by the search for pages to send */
226 struct PageSearchStatus {
227     /* Current block being searched */
228     RAMBlock    *block;
229     /* Current offset to search from */
230     ram_addr_t   offset;
231     /* Set once we wrap around */
232     bool         complete_round;
233 };
234 typedef struct PageSearchStatus PageSearchStatus;
235 
236 static struct BitmapRcu {
237     struct rcu_head rcu;
238     /* Main migration bitmap */
239     unsigned long *bmap;
240     /* bitmap of pages that haven't been sent even once
241      * only maintained and used in postcopy at the moment
242      * where it's used to send the dirtymap at the start
243      * of the postcopy phase
244      */
245     unsigned long *unsentmap;
246 } *migration_bitmap_rcu;
247 
248 struct CompressParam {
249     bool done;
250     bool quit;
251     QEMUFile *file;
252     QemuMutex mutex;
253     QemuCond cond;
254     RAMBlock *block;
255     ram_addr_t offset;
256 };
257 typedef struct CompressParam CompressParam;
258 
259 struct DecompressParam {
260     bool done;
261     bool quit;
262     QemuMutex mutex;
263     QemuCond cond;
264     void *des;
265     uint8_t *compbuf;
266     int len;
267 };
268 typedef struct DecompressParam DecompressParam;
269 
270 static CompressParam *comp_param;
271 static QemuThread *compress_threads;
272 /* comp_done_cond is used to wake up the migration thread when
273  * one of the compression threads has finished the compression.
274  * comp_done_lock is used to co-work with comp_done_cond.
275  */
276 static QemuMutex comp_done_lock;
277 static QemuCond comp_done_cond;
278 /* The empty QEMUFileOps will be used by the 'file' member of CompressParam */
279 static const QEMUFileOps empty_ops = { };
280 
281 static bool compression_switch;
282 static DecompressParam *decomp_param;
283 static QemuThread *decompress_threads;
284 static QemuMutex decomp_done_lock;
285 static QemuCond decomp_done_cond;
286 
287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
288                                 ram_addr_t offset);
289 
290 static void *do_data_compress(void *opaque)
291 {
292     CompressParam *param = opaque;
293     RAMBlock *block;
294     ram_addr_t offset;
295 
296     qemu_mutex_lock(&param->mutex);
297     while (!param->quit) {
298         if (param->block) {
299             block = param->block;
300             offset = param->offset;
301             param->block = NULL;
302             qemu_mutex_unlock(&param->mutex);
303 
304             do_compress_ram_page(param->file, block, offset);
305 
306             qemu_mutex_lock(&comp_done_lock);
307             param->done = true;
308             qemu_cond_signal(&comp_done_cond);
309             qemu_mutex_unlock(&comp_done_lock);
310 
311             qemu_mutex_lock(&param->mutex);
312         } else {
313             qemu_cond_wait(&param->cond, &param->mutex);
314         }
315     }
316     qemu_mutex_unlock(&param->mutex);
317 
318     return NULL;
319 }
320 
321 static inline void terminate_compression_threads(void)
322 {
323     int idx, thread_count;
324 
325     thread_count = migrate_compress_threads();
326     for (idx = 0; idx < thread_count; idx++) {
327         qemu_mutex_lock(&comp_param[idx].mutex);
328         comp_param[idx].quit = true;
329         qemu_cond_signal(&comp_param[idx].cond);
330         qemu_mutex_unlock(&comp_param[idx].mutex);
331     }
332 }
333 
334 void migrate_compress_threads_join(void)
335 {
336     int i, thread_count;
337 
338     if (!migrate_use_compression()) {
339         return;
340     }
341     terminate_compression_threads();
342     thread_count = migrate_compress_threads();
343     for (i = 0; i < thread_count; i++) {
344         qemu_thread_join(compress_threads + i);
345         qemu_fclose(comp_param[i].file);
346         qemu_mutex_destroy(&comp_param[i].mutex);
347         qemu_cond_destroy(&comp_param[i].cond);
348     }
349     qemu_mutex_destroy(&comp_done_lock);
350     qemu_cond_destroy(&comp_done_cond);
351     g_free(compress_threads);
352     g_free(comp_param);
353     compress_threads = NULL;
354     comp_param = NULL;
355 }
356 
357 void migrate_compress_threads_create(void)
358 {
359     int i, thread_count;
360 
361     if (!migrate_use_compression()) {
362         return;
363     }
364     compression_switch = true;
365     thread_count = migrate_compress_threads();
366     compress_threads = g_new0(QemuThread, thread_count);
367     comp_param = g_new0(CompressParam, thread_count);
368     qemu_cond_init(&comp_done_cond);
369     qemu_mutex_init(&comp_done_lock);
370     for (i = 0; i < thread_count; i++) {
371         /* comp_param[i].file is just used as a dummy buffer to save data,
372          * set its ops to empty.
373          */
374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375         comp_param[i].done = true;
376         comp_param[i].quit = false;
377         qemu_mutex_init(&comp_param[i].mutex);
378         qemu_cond_init(&comp_param[i].cond);
379         qemu_thread_create(compress_threads + i, "compress",
380                            do_data_compress, comp_param + i,
381                            QEMU_THREAD_JOINABLE);
382     }
383 }
384 
385 /**
386  * save_page_header: Write page header to wire
387  *
388  * If this is the 1st block, it also writes the block identification
389  *
390  * Returns: Number of bytes written
391  *
392  * @f: QEMUFile where to send the data
393  * @block: block that contains the page we want to send
394  * @offset: offset inside the block for the page
395  *          in the lower bits, it contains flags
396  */
397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
398 {
399     size_t size, len;
400 
401     qemu_put_be64(f, offset);
402     size = 8;
403 
404     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
405         len = strlen(block->idstr);
406         qemu_put_byte(f, len);
407         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
408         size += 1 + len;
409     }
410     return size;
411 }
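
/*
 * The header produced above has the following layout on the wire (a sketch
 * derived from the code, not authoritative protocol documentation):
 *
 *   8 bytes  big-endian page offset, with RAM_SAVE_FLAG_* in the low bits
 *   1 byte   length of the block idstr  \ only present when
 *   N bytes  the block idstr itself     / RAM_SAVE_FLAG_CONTINUE is not set
 */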
412 
413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
414  * If guest dirty memory rate is reduced below the rate at which we can
415  * transfer pages to the destination then we should be able to complete
416  * migration. Some workloads dirty memory way too fast and will not effectively
417  * converge, even with auto-converge.
418  */
419 static void mig_throttle_guest_down(void)
420 {
421     MigrationState *s = migrate_get_current();
422     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
423     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
424 
425     /* We have not started throttling yet. Let's start it. */
426     if (!cpu_throttle_active()) {
427         cpu_throttle_set(pct_initial);
428     } else {
429         /* Throttling already on, just increase the rate */
430         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
431     }
432 }
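
/*
 * Worked example (assuming the usual defaults of cpu_throttle_initial=20
 * and cpu_throttle_increment=10, which are defined outside this file):
 * successive calls throttle the guest at 20%, then 30%, 40% and so on,
 * for as long as migration_bitmap_sync() keeps seeing a high dirty rate.
 */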
433 
434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
435  * The important thing is that a stale (not-yet-0'd) page be replaced
436  * by the new data.
437  * As a bonus, if the page wasn't in the cache it gets added so that
438  * when a small write is made into the 0'd page it gets XBZRLE sent
439  */
440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
441 {
442     if (ram_bulk_stage || !migrate_use_xbzrle()) {
443         return;
444     }
445 
446     /* We don't care if this fails to allocate a new cache page
447      * as long as it updated an old one */
448     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
449                  bitmap_sync_count);
450 }
451 
452 #define ENCODING_FLAG_XBZRLE 0x1
453 
454 /**
455  * save_xbzrle_page: compress and send current page
456  *
457  * Returns: 1 means that we wrote the page
458  *          0 means that page is identical to the one already sent
459  *          -1 means that xbzrle would be longer than normal
460  *
461  * @f: QEMUFile where to send the data
462  * @current_data: pointer to the address of the page contents; may be
463  *                updated to point at the cached copy of the page
464  * @current_addr: ram address of the page
464  * @block: block that contains the page we want to send
465  * @offset: offset inside the block for the page
466  * @last_stage: if we are at the completion stage
467  * @bytes_transferred: increase it with the number of transferred bytes
468  */
469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
470                             ram_addr_t current_addr, RAMBlock *block,
471                             ram_addr_t offset, bool last_stage,
472                             uint64_t *bytes_transferred)
473 {
474     int encoded_len = 0, bytes_xbzrle;
475     uint8_t *prev_cached_page;
476 
477     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
478         acct_info.xbzrle_cache_miss++;
479         if (!last_stage) {
480             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
481                              bitmap_sync_count) == -1) {
482                 return -1;
483             } else {
484                 /* update *current_data when the page has been
485                    inserted into cache */
486                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
487             }
488         }
489         return -1;
490     }
491 
492     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
493 
494     /* save current buffer into memory */
495     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
496 
497     /* XBZRLE encoding (if there is no overflow) */
498     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
499                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
500                                        TARGET_PAGE_SIZE);
501     if (encoded_len == 0) {
502         trace_save_xbzrle_page_skipping();
503         return 0;
504     } else if (encoded_len == -1) {
505         trace_save_xbzrle_page_overflow();
506         acct_info.xbzrle_overflows++;
507         /* update data in the cache */
508         if (!last_stage) {
509             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
510             *current_data = prev_cached_page;
511         }
512         return -1;
513     }
514 
515     /* we need to update the data in the cache, in order to get the same data */
516     if (!last_stage) {
517         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
518     }
519 
520     /* Send XBZRLE based compressed page */
521     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
522     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
523     qemu_put_be16(f, encoded_len);
524     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
525     bytes_xbzrle += encoded_len + 1 + 2;
526     acct_info.xbzrle_pages++;
527     acct_info.xbzrle_bytes += bytes_xbzrle;
528     *bytes_transferred += bytes_xbzrle;
529 
530     return 1;
531 }
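
/*
 * On the wire an XBZRLE page therefore looks like this (sketch based on
 * the code above):
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *   1 byte   ENCODING_FLAG_XBZRLE
 *   2 bytes  big-endian encoded length
 *   N bytes  xbzrle delta against the previously cached copy of the page
 */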
532 
533 /* Called with rcu_read_lock() to protect migration_bitmap
534  * rb: The RAMBlock to search for dirty pages in
535  * start: Start address (typically so we can continue from previous page)
536  * ram_addr_abs: Pointer into which to store the address of the dirty page
537  *               within the global ram_addr space
538  *
539  * Returns: byte offset within memory region of the start of a dirty page
540  */
541 static inline
542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
543                                        ram_addr_t start,
544                                        ram_addr_t *ram_addr_abs)
545 {
546     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
547     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
548     uint64_t rb_size = rb->used_length;
549     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
550     unsigned long *bitmap;
551 
552     unsigned long next;
553 
554     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
555     if (ram_bulk_stage && nr > base) {
556         next = nr + 1;
557     } else {
558         next = find_next_bit(bitmap, size, nr);
559     }
560 
561     *ram_addr_abs = next << TARGET_PAGE_BITS;
562     return (next - base) << TARGET_PAGE_BITS;
563 }
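
/*
 * Worked example (hypothetical sizes): with a 4 KiB target page and a block
 * at rb->offset == 2 MiB, base is 512.  If the next dirty bit is at index
 * 515, *ram_addr_abs becomes 2 MiB + 12 KiB and the returned in-block
 * offset is (515 - 512) << TARGET_PAGE_BITS == 12 KiB.
 */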
564 
565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
566 {
567     bool ret;
568     int nr = addr >> TARGET_PAGE_BITS;
569     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
570 
571     ret = test_and_clear_bit(nr, bitmap);
572 
573     if (ret) {
574         migration_dirty_pages--;
575     }
576     return ret;
577 }
578 
579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
580 {
581     unsigned long *bitmap;
582     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
583     migration_dirty_pages +=
584         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
585 }
586 
587 /* Fix me: there are too many global variables used in migration process. */
588 static int64_t start_time;
589 static int64_t bytes_xfer_prev;
590 static int64_t num_dirty_pages_period;
591 static uint64_t xbzrle_cache_miss_prev;
592 static uint64_t iterations_prev;
593 
594 static void migration_bitmap_sync_init(void)
595 {
596     start_time = 0;
597     bytes_xfer_prev = 0;
598     num_dirty_pages_period = 0;
599     xbzrle_cache_miss_prev = 0;
600     iterations_prev = 0;
601 }
602 
603 static void migration_bitmap_sync(void)
604 {
605     RAMBlock *block;
606     uint64_t num_dirty_pages_init = migration_dirty_pages;
607     MigrationState *s = migrate_get_current();
608     int64_t end_time;
609     int64_t bytes_xfer_now;
610 
611     bitmap_sync_count++;
612 
613     if (!bytes_xfer_prev) {
614         bytes_xfer_prev = ram_bytes_transferred();
615     }
616 
617     if (!start_time) {
618         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
619     }
620 
621     trace_migration_bitmap_sync_start();
622     memory_global_dirty_log_sync();
623 
624     qemu_mutex_lock(&migration_bitmap_mutex);
625     rcu_read_lock();
626     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
627         migration_bitmap_sync_range(block->offset, block->used_length);
628     }
629     rcu_read_unlock();
630     qemu_mutex_unlock(&migration_bitmap_mutex);
631 
632     trace_migration_bitmap_sync_end(migration_dirty_pages
633                                     - num_dirty_pages_init);
634     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
635     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
636 
637     /* more than 1 second = 1000 milliseconds */
638     if (end_time > start_time + 1000) {
639         if (migrate_auto_converge()) {
640             /* The following detection logic can be refined later. For now:
641                Check to see if the dirtied bytes are 50% more than the approx.
642                amount of bytes that just got transferred since the last time we
643                were in this routine. If that happens twice, start or increase
644                throttling */
645             bytes_xfer_now = ram_bytes_transferred();
646 
647             if (s->dirty_pages_rate &&
648                 (num_dirty_pages_period * TARGET_PAGE_SIZE >
649                  (bytes_xfer_now - bytes_xfer_prev) / 2) &&
650                 (dirty_rate_high_cnt++ >= 2)) {
651                 trace_migration_throttle();
652                 dirty_rate_high_cnt = 0;
653                 mig_throttle_guest_down();
654             }
655             bytes_xfer_prev = bytes_xfer_now;
656         }
657 
658         if (migrate_use_xbzrle()) {
659             if (iterations_prev != acct_info.iterations) {
660                 acct_info.xbzrle_cache_miss_rate =
661                    (double)(acct_info.xbzrle_cache_miss -
662                             xbzrle_cache_miss_prev) /
663                    (acct_info.iterations - iterations_prev);
664             }
665             iterations_prev = acct_info.iterations;
666             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
667         }
668         s->dirty_pages_rate = num_dirty_pages_period * 1000
669             / (end_time - start_time);
670         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
671         start_time = end_time;
672         num_dirty_pages_period = 0;
673     }
674     s->dirty_sync_count = bitmap_sync_count;
675     if (migrate_use_events()) {
676         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
677     }
678 }
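
/*
 * Example of the rate computation above (hypothetical numbers): if 25,000
 * pages were dirtied during a 1,000 ms window, dirty_pages_rate is 25,000
 * pages/s and, with 4 KiB target pages, dirty_bytes_rate is about 100 MB/s.
 * Auto-converge compares the dirtied bytes against the bytes transferred in
 * the same window to decide whether to throttle.
 */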
679 
680 /**
681  * save_zero_page: Send the zero page to the stream
682  *
683  * Returns: Number of pages written.
684  *
685  * @f: QEMUFile where to send the data
686  * @block: block that contains the page we want to send
687  * @offset: offset inside the block for the page
688  * @p: pointer to the page
689  * @bytes_transferred: increase it with the number of transferred bytes
690  */
691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
692                           uint8_t *p, uint64_t *bytes_transferred)
693 {
694     int pages = -1;
695 
696     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
697         acct_info.dup_pages++;
698         *bytes_transferred += save_page_header(f, block,
699                                                offset | RAM_SAVE_FLAG_COMPRESS);
700         qemu_put_byte(f, 0);
701         *bytes_transferred += 1;
702         pages = 1;
703     }
704 
705     return pages;
706 }
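
/*
 * A zero page is thus encoded as just a page header carrying
 * RAM_SAVE_FLAG_COMPRESS plus a single fill byte (0); the destination
 * recreates the whole page from that byte.
 */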
707 
708 /**
709  * ram_save_page: Send the given page to the stream
710  *
711  * Returns: Number of pages written.
712  *          < 0 - error
713  *          >=0 - Number of pages written - this might legally be 0
714  *                if xbzrle noticed the page was the same.
715  *
716  * @f: QEMUFile where to send the data
717  * @pss: data about the page we want to send (the block that contains it
718  *       and the offset of the page inside the block)
719  * @last_stage: if we are at the completion stage
720  * @bytes_transferred: increase it with the number of transferred bytes
721  */
722 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
723                          bool last_stage, uint64_t *bytes_transferred)
724 {
725     int pages = -1;
726     uint64_t bytes_xmit;
727     ram_addr_t current_addr;
728     uint8_t *p;
729     int ret;
730     bool send_async = true;
731     RAMBlock *block = pss->block;
732     ram_addr_t offset = pss->offset;
733 
734     p = block->host + offset;
735 
736     /* In doubt, send the page as a normal page */
737     bytes_xmit = 0;
738     ret = ram_control_save_page(f, block->offset,
739                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
740     if (bytes_xmit) {
741         *bytes_transferred += bytes_xmit;
742         pages = 1;
743     }
744 
745     XBZRLE_cache_lock();
746 
747     current_addr = block->offset + offset;
748 
749     if (block == last_sent_block) {
750         offset |= RAM_SAVE_FLAG_CONTINUE;
751     }
752     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
753         if (ret != RAM_SAVE_CONTROL_DELAYED) {
754             if (bytes_xmit > 0) {
755                 acct_info.norm_pages++;
756             } else if (bytes_xmit == 0) {
757                 acct_info.dup_pages++;
758             }
759         }
760     } else {
761         pages = save_zero_page(f, block, offset, p, bytes_transferred);
762         if (pages > 0) {
763             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
764              * page would be stale
765              */
766             xbzrle_cache_zero_page(current_addr);
767         } else if (!ram_bulk_stage &&
768                    !migration_in_postcopy(migrate_get_current()) &&
769                    migrate_use_xbzrle()) {
770             pages = save_xbzrle_page(f, &p, current_addr, block,
771                                      offset, last_stage, bytes_transferred);
772             if (!last_stage) {
773                 /* Can't send this cached data async, since the cache page
774                  * might get updated before it gets to the wire
775                  */
776                 send_async = false;
777             }
778         }
779     }
780 
781     /* XBZRLE overflow or normal page */
782     if (pages == -1) {
783         *bytes_transferred += save_page_header(f, block,
784                                                offset | RAM_SAVE_FLAG_PAGE);
785         if (send_async) {
786             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
787         } else {
788             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
789         }
790         *bytes_transferred += TARGET_PAGE_SIZE;
791         pages = 1;
792         acct_info.norm_pages++;
793     }
794 
795     XBZRLE_cache_unlock();
796 
797     return pages;
798 }
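
/*
 * Summary of the decision order in ram_save_page() above: first offer the
 * page to ram_control_save_page() (e.g. an RDMA transport may take it),
 * then try the zero-page test, then XBZRLE (only outside the bulk stage
 * and outside postcopy), and finally fall back to sending the full page
 * with RAM_SAVE_FLAG_PAGE.
 */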
799 
800 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
801                                 ram_addr_t offset)
802 {
803     int bytes_sent, blen;
804     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
805 
806     bytes_sent = save_page_header(f, block, offset |
807                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
808     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
809                                      migrate_compress_level());
810     if (blen < 0) {
811         bytes_sent = 0;
812         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
813         error_report("compressed data failed!");
814     } else {
815         bytes_sent += blen;
816     }
817 
818     return bytes_sent;
819 }
820 
821 static uint64_t bytes_transferred;
822 
823 static void flush_compressed_data(QEMUFile *f)
824 {
825     int idx, len, thread_count;
826 
827     if (!migrate_use_compression()) {
828         return;
829     }
830     thread_count = migrate_compress_threads();
831 
832     qemu_mutex_lock(&comp_done_lock);
833     for (idx = 0; idx < thread_count; idx++) {
834         while (!comp_param[idx].done) {
835             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
836         }
837     }
838     qemu_mutex_unlock(&comp_done_lock);
839 
840     for (idx = 0; idx < thread_count; idx++) {
841         qemu_mutex_lock(&comp_param[idx].mutex);
842         if (!comp_param[idx].quit) {
843             len = qemu_put_qemu_file(f, comp_param[idx].file);
844             bytes_transferred += len;
845         }
846         qemu_mutex_unlock(&comp_param[idx].mutex);
847     }
848 }
849 
850 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
851                                        ram_addr_t offset)
852 {
853     param->block = block;
854     param->offset = offset;
855 }
856 
857 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
858                                            ram_addr_t offset,
859                                            uint64_t *bytes_transferred)
860 {
861     int idx, thread_count, bytes_xmit = -1, pages = -1;
862 
863     thread_count = migrate_compress_threads();
864     qemu_mutex_lock(&comp_done_lock);
865     while (true) {
866         for (idx = 0; idx < thread_count; idx++) {
867             if (comp_param[idx].done) {
868                 comp_param[idx].done = false;
869                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
870                 qemu_mutex_lock(&comp_param[idx].mutex);
871                 set_compress_params(&comp_param[idx], block, offset);
872                 qemu_cond_signal(&comp_param[idx].cond);
873                 qemu_mutex_unlock(&comp_param[idx].mutex);
874                 pages = 1;
875                 acct_info.norm_pages++;
876                 *bytes_transferred += bytes_xmit;
877                 break;
878             }
879         }
880         if (pages > 0) {
881             break;
882         } else {
883             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
884         }
885     }
886     qemu_mutex_unlock(&comp_done_lock);
887 
888     return pages;
889 }
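
/*
 * Handshake between the migration thread and the compression workers, as
 * implemented above: each worker owns a private QEMUFile (backed by the
 * empty_ops buffer) that it compresses into.  The migration thread waits on
 * comp_done_cond for a worker whose 'done' flag is set, drains that
 * worker's buffer into the real stream with qemu_put_qemu_file(), and then
 * hands the worker a new block/offset via set_compress_params().
 */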
890 
891 /**
892  * ram_save_compressed_page: compress the given page and send it to the stream
893  *
894  * Returns: Number of pages written.
895  *
896  * @f: QEMUFile where to send the data
897  * @pss: data about the page we want to send (the block that contains it
898  *       and the offset of the page inside the block)
899  * @last_stage: if we are at the completion stage
900  * @bytes_transferred: increase it with the number of transferred bytes
901  */
902 static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
903                                     bool last_stage,
904                                     uint64_t *bytes_transferred)
905 {
906     int pages = -1;
907     uint64_t bytes_xmit = 0;
908     uint8_t *p;
909     int ret, blen;
910     RAMBlock *block = pss->block;
911     ram_addr_t offset = pss->offset;
912 
913     p = block->host + offset;
914 
915     ret = ram_control_save_page(f, block->offset,
916                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
917     if (bytes_xmit) {
918         *bytes_transferred += bytes_xmit;
919         pages = 1;
920     }
921     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
922         if (ret != RAM_SAVE_CONTROL_DELAYED) {
923             if (bytes_xmit > 0) {
924                 acct_info.norm_pages++;
925             } else if (bytes_xmit == 0) {
926                 acct_info.dup_pages++;
927             }
928         }
929     } else {
930         /* When starting to process a new block, the first page of the
931          * block must be sent out before other pages in the same block,
932          * and all the pages of the previous block must already have been
933          * sent out.  Keeping this order is important, because the 'cont'
934          * flag is used to avoid resending the block name.
935          */
936         if (block != last_sent_block) {
937             flush_compressed_data(f);
938             pages = save_zero_page(f, block, offset, p, bytes_transferred);
939             if (pages == -1) {
940                 /* Make sure the first page is sent out before other pages */
941                 bytes_xmit = save_page_header(f, block, offset |
942                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
943                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
944                                                  migrate_compress_level());
945                 if (blen > 0) {
946                     *bytes_transferred += bytes_xmit + blen;
947                     acct_info.norm_pages++;
948                     pages = 1;
949                 } else {
950                     qemu_file_set_error(f, blen);
951                     error_report("compressed data failed!");
952                 }
953             }
954         } else {
955             offset |= RAM_SAVE_FLAG_CONTINUE;
956             pages = save_zero_page(f, block, offset, p, bytes_transferred);
957             if (pages == -1) {
958                 pages = compress_page_with_multi_thread(f, block, offset,
959                                                         bytes_transferred);
960             }
961         }
962     }
963 
964     return pages;
965 }
966 
967 /*
968  * Find the next dirty page and update any state associated with
969  * the search process.
970  *
971  * Returns: True if a page is found
972  *
973  * @f: Current migration stream.
974  * @pss: Data about the state of the current dirty page scan.
975  * @again: Set to false if the search has scanned the whole of RAM
976  * @ram_addr_abs: Pointer into which to store the address of the dirty page
977  *                within the global ram_addr space
978  */
979 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
980                              bool *again, ram_addr_t *ram_addr_abs)
981 {
982     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
983                                               ram_addr_abs);
984     if (pss->complete_round && pss->block == last_seen_block &&
985         pss->offset >= last_offset) {
986         /*
987          * We've been once around the RAM and haven't found anything.
988          * Give up.
989          */
990         *again = false;
991         return false;
992     }
993     if (pss->offset >= pss->block->used_length) {
994         /* Didn't find anything in this RAM Block */
995         pss->offset = 0;
996         pss->block = QLIST_NEXT_RCU(pss->block, next);
997         if (!pss->block) {
998             /* Hit the end of the list */
999             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1000             /* Flag that we've looped */
1001             pss->complete_round = true;
1002             ram_bulk_stage = false;
1003             if (migrate_use_xbzrle()) {
1004                 /* If xbzrle is on, stop using the data compression at this
1005                  * point. In theory, xbzrle can do better than compression.
1006                  */
1007                 flush_compressed_data(f);
1008                 compression_switch = false;
1009             }
1010         }
1011         /* Didn't find anything this time, but try again on the new block */
1012         *again = true;
1013         return false;
1014     } else {
1015         /* Can go around again, but... */
1016         *again = true;
1017         /* We've found something so probably don't need to */
1018         return true;
1019     }
1020 }
1021 
1022 /*
1023  * Helper for 'get_queued_page' - gets a page off the queue
1024  *      ms:      MigrationState in
1025  * *offset:      Used to return the offset within the RAMBlock
1026  * ram_addr_abs: global offset in the dirty/sent bitmaps
1027  *
1028  * Returns:      block (or NULL if none available)
1029  */
1030 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1031                               ram_addr_t *ram_addr_abs)
1032 {
1033     RAMBlock *block = NULL;
1034 
1035     qemu_mutex_lock(&ms->src_page_req_mutex);
1036     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1037         struct MigrationSrcPageRequest *entry =
1038                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1039         block = entry->rb;
1040         *offset = entry->offset;
1041         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1042                         TARGET_PAGE_MASK;
1043 
1044         if (entry->len > TARGET_PAGE_SIZE) {
1045             entry->len -= TARGET_PAGE_SIZE;
1046             entry->offset += TARGET_PAGE_SIZE;
1047         } else {
1048             memory_region_unref(block->mr);
1049             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1050             g_free(entry);
1051         }
1052     }
1053     qemu_mutex_unlock(&ms->src_page_req_mutex);
1054 
1055     return block;
1056 }
1057 
1058 /*
1059  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1060  * that are already sent (!dirty)
1061  *
1062  *      ms:      MigrationState in
1063  *     pss:      PageSearchStatus structure updated with found block/offset
1064  * ram_addr_abs: global offset in the dirty/sent bitmaps
1065  *
1066  * Returns:      true if a queued page is found
1067  */
1068 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1069                             ram_addr_t *ram_addr_abs)
1070 {
1071     RAMBlock  *block;
1072     ram_addr_t offset;
1073     bool dirty;
1074 
1075     do {
1076         block = unqueue_page(ms, &offset, ram_addr_abs);
1077         /*
1078          * We're sending this page, and since it's postcopy nothing else
1079          * will dirty it, and we must make sure it doesn't get sent again
1080          * even if this queue request was received after the background
1081          * search already sent it.
1082          */
1083         if (block) {
1084             unsigned long *bitmap;
1085             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1086             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1087             if (!dirty) {
1088                 trace_get_queued_page_not_dirty(
1089                     block->idstr, (uint64_t)offset,
1090                     (uint64_t)*ram_addr_abs,
1091                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1092                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1093             } else {
1094                 trace_get_queued_page(block->idstr,
1095                                       (uint64_t)offset,
1096                                       (uint64_t)*ram_addr_abs);
1097             }
1098         }
1099 
1100     } while (block && !dirty);
1101 
1102     if (block) {
1103         /*
1104          * As soon as we start servicing pages out of order, we have to
1105          * kill the bulk stage, since the bulk stage assumes (in
1106          * migration_bitmap_find_dirty) that every page is dirty, and that
1107          * is no longer true.
1108          */
1109         ram_bulk_stage = false;
1110 
1111         /*
1112          * We want the background search to continue from the queued page
1113          * since the guest is likely to want other pages near to the page
1114          * it just requested.
1115          */
1116         pss->block = block;
1117         pss->offset = offset;
1118     }
1119 
1120     return !!block;
1121 }
1122 
1123 /**
1124  * flush_page_queue: Flush any remaining pages in the ram request queue;
1125  *    it should be empty at the end anyway, but in error cases there may
1126  *    be some left.
1127  *
1128  * ms: MigrationState
1129  */
1130 void flush_page_queue(MigrationState *ms)
1131 {
1132     struct MigrationSrcPageRequest *mspr, *next_mspr;
1133     /* This queue generally should be empty - but in the case of a failed
1134      * migration it might have some entries left over.
1135      */
1136     rcu_read_lock();
1137     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1138         memory_region_unref(mspr->rb->mr);
1139         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1140         g_free(mspr);
1141     }
1142     rcu_read_unlock();
1143 }
1144 
1145 /**
1146  * Queue the pages for transmission, e.g. a request from postcopy destination
1147  *   ms: MigrationState in which the queue is held
1148  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1149  *   start: Offset from the start of the RAMBlock
1150  *   len: Length (in bytes) to send
1151  *   Return: 0 on success
1152  */
1153 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1154                          ram_addr_t start, ram_addr_t len)
1155 {
1156     RAMBlock *ramblock;
1157 
1158     ms->postcopy_requests++;
1159     rcu_read_lock();
1160     if (!rbname) {
1161         /* Reuse last RAMBlock */
1162         ramblock = ms->last_req_rb;
1163 
1164         if (!ramblock) {
1165             /*
1166              * Shouldn't happen, we can't reuse the last RAMBlock if
1167              * it's the 1st request.
1168              */
1169             error_report("ram_save_queue_pages no previous block");
1170             goto err;
1171         }
1172     } else {
1173         ramblock = qemu_ram_block_by_name(rbname);
1174 
1175         if (!ramblock) {
1176             /* We shouldn't be asked for a non-existent RAMBlock */
1177             error_report("ram_save_queue_pages no block '%s'", rbname);
1178             goto err;
1179         }
1180         ms->last_req_rb = ramblock;
1181     }
1182     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1183     if (start + len > ramblock->used_length) {
1184         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1185                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1186                      __func__, start, len, ramblock->used_length);
1187         goto err;
1188     }
1189 
1190     struct MigrationSrcPageRequest *new_entry =
1191         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1192     new_entry->rb = ramblock;
1193     new_entry->offset = start;
1194     new_entry->len = len;
1195 
1196     memory_region_ref(ramblock->mr);
1197     qemu_mutex_lock(&ms->src_page_req_mutex);
1198     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1199     qemu_mutex_unlock(&ms->src_page_req_mutex);
1200     rcu_read_unlock();
1201 
1202     return 0;
1203 
1204 err:
1205     rcu_read_unlock();
1206     return -1;
1207 }
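
/*
 * Usage note: in postcopy the destination faults on a page and sends back a
 * request naming the RAMBlock, start offset and length; that request ends
 * up here.  A request may cover several target pages; unqueue_page() above
 * then consumes it one target page at a time.
 */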
1208 
1209 /**
1210  * ram_save_target_page: Save one target page
1211  *
1212  * @ms: The current migration state
1213  * @f: QEMUFile where to send the data
1214  * @pss: data about the page we want to send (the block that contains it
1215  *       and the offset of the page inside the block)
1216  * @last_stage: if we are at the completion stage
1217  * @bytes_transferred: increase it with the number of transferred bytes
1218  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1219  *
1220  * Returns: Number of pages written.
1221  */
1222 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1223                                 PageSearchStatus *pss,
1224                                 bool last_stage,
1225                                 uint64_t *bytes_transferred,
1226                                 ram_addr_t dirty_ram_abs)
1227 {
1228     int res = 0;
1229 
1230     /* Check if the page is dirty and, if it is, send it */
1231     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1232         unsigned long *unsentmap;
1233         if (compression_switch && migrate_use_compression()) {
1234             res = ram_save_compressed_page(f, pss,
1235                                            last_stage,
1236                                            bytes_transferred);
1237         } else {
1238             res = ram_save_page(f, pss, last_stage,
1239                                 bytes_transferred);
1240         }
1241 
1242         if (res < 0) {
1243             return res;
1244         }
1245         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1246         if (unsentmap) {
1247             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1248         }
1249         /* Only update last_sent_block if a block was actually sent; xbzrle
1250          * might have decided the page was identical so didn't bother writing
1251          * to the stream.
1252          */
1253         if (res > 0) {
1254             last_sent_block = pss->block;
1255         }
1256     }
1257 
1258     return res;
1259 }
1260 
1261 /**
1262  * ram_save_host_page: Starting at *offset send pages up to the end
1263  *                     of the current host page.  It's valid for the initial
1264  *                     offset to point into the middle of a host page
1265  *                     in which case the remainder of the hostpage is sent.
1266  *                     Only dirty target pages are sent.
1267  *
1268  * Returns: Number of pages written.
1269  *
1270  * @f: QEMUFile where to send the data
1271  * @pss: data about the page we want to send; pss->offset is updated to
1272  *       the last target page sent
1274  * @last_stage: if we are at the completion stage
1275  * @bytes_transferred: increase it with the number of transferred bytes
1276  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1277  */
1278 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1279                               PageSearchStatus *pss,
1280                               bool last_stage,
1281                               uint64_t *bytes_transferred,
1282                               ram_addr_t dirty_ram_abs)
1283 {
1284     int tmppages, pages = 0;
1285     do {
1286         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1287                                         bytes_transferred, dirty_ram_abs);
1288         if (tmppages < 0) {
1289             return tmppages;
1290         }
1291 
1292         pages += tmppages;
1293         pss->offset += TARGET_PAGE_SIZE;
1294         dirty_ram_abs += TARGET_PAGE_SIZE;
1295     } while (pss->offset & (qemu_host_page_size - 1));
1296 
1297     /* The offset we leave with is the last one we looked at */
1298     pss->offset -= TARGET_PAGE_SIZE;
1299     return pages;
1300 }
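
/*
 * Worked example (hypothetical sizes): with 64 KiB host pages and 4 KiB
 * target pages the loop above covers up to 16 target pages, and it leaves
 * pss->offset pointing at the last target page examined so the caller's
 * scan resumes from there.
 */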
1301 
1302 /**
1303  * ram_find_and_save_block: Finds a dirty page and sends it to f
1304  *
1305  * Called within an RCU critical section.
1306  *
1307  * Returns:  The number of pages written
1308  *           0 means no dirty pages
1309  *
1310  * @f: QEMUFile where to send the data
1311  * @last_stage: if we are at the completion stage
1312  * @bytes_transferred: increase it with the number of transferred bytes
1313  *
1314  * On systems where host-page-size > target-page-size it will send all the
1315  * pages in a host page that are dirty.
1316  */
1317 
1318 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1319                                    uint64_t *bytes_transferred)
1320 {
1321     PageSearchStatus pss;
1322     MigrationState *ms = migrate_get_current();
1323     int pages = 0;
1324     bool again, found;
1325     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1326                                  ram_addr_t space */
1327 
1328     pss.block = last_seen_block;
1329     pss.offset = last_offset;
1330     pss.complete_round = false;
1331 
1332     if (!pss.block) {
1333         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1334     }
1335 
1336     do {
1337         again = true;
1338         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1339 
1340         if (!found) {
1341             /* priority queue empty, so just search for something dirty */
1342             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1343         }
1344 
1345         if (found) {
1346             pages = ram_save_host_page(ms, f, &pss,
1347                                        last_stage, bytes_transferred,
1348                                        dirty_ram_abs);
1349         }
1350     } while (!pages && again);
1351 
1352     last_seen_block = pss.block;
1353     last_offset = pss.offset;
1354 
1355     return pages;
1356 }
1357 
1358 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1359 {
1360     uint64_t pages = size / TARGET_PAGE_SIZE;
1361     if (zero) {
1362         acct_info.dup_pages += pages;
1363     } else {
1364         acct_info.norm_pages += pages;
1365         bytes_transferred += size;
1366         qemu_update_position(f, size);
1367     }
1368 }
1369 
1370 static ram_addr_t ram_save_remaining(void)
1371 {
1372     return migration_dirty_pages;
1373 }
1374 
1375 uint64_t ram_bytes_remaining(void)
1376 {
1377     return ram_save_remaining() * TARGET_PAGE_SIZE;
1378 }
1379 
1380 uint64_t ram_bytes_transferred(void)
1381 {
1382     return bytes_transferred;
1383 }
1384 
1385 uint64_t ram_bytes_total(void)
1386 {
1387     RAMBlock *block;
1388     uint64_t total = 0;
1389 
1390     rcu_read_lock();
1391     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1392         total += block->used_length;
1393     rcu_read_unlock();
1394     return total;
1395 }
1396 
1397 void free_xbzrle_decoded_buf(void)
1398 {
1399     g_free(xbzrle_decoded_buf);
1400     xbzrle_decoded_buf = NULL;
1401 }
1402 
1403 static void migration_bitmap_free(struct BitmapRcu *bmap)
1404 {
1405     g_free(bmap->bmap);
1406     g_free(bmap->unsentmap);
1407     g_free(bmap);
1408 }
1409 
1410 static void ram_migration_cleanup(void *opaque)
1411 {
1412     /* The caller holds the iothread lock or is in a bottom half, so there
1413      * is no write race against this migration_bitmap.
1414      */
1415     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1416     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1417     if (bitmap) {
1418         memory_global_dirty_log_stop();
1419         call_rcu(bitmap, migration_bitmap_free, rcu);
1420     }
1421 
1422     XBZRLE_cache_lock();
1423     if (XBZRLE.cache) {
1424         cache_fini(XBZRLE.cache);
1425         g_free(XBZRLE.encoded_buf);
1426         g_free(XBZRLE.current_buf);
1427         g_free(ZERO_TARGET_PAGE);
1428         XBZRLE.cache = NULL;
1429         XBZRLE.encoded_buf = NULL;
1430         XBZRLE.current_buf = NULL;
1431     }
1432     XBZRLE_cache_unlock();
1433 }
1434 
1435 static void reset_ram_globals(void)
1436 {
1437     last_seen_block = NULL;
1438     last_sent_block = NULL;
1439     last_offset = 0;
1440     last_version = ram_list.version;
1441     ram_bulk_stage = true;
1442 }
1443 
1444 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1445 
1446 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1447 {
1448     /* Called in the QEMU main thread, so there is
1449      * no write race against this migration_bitmap.
1450      */
1451     if (migration_bitmap_rcu) {
1452         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1453         bitmap = g_new(struct BitmapRcu, 1);
1454         bitmap->bmap = bitmap_new(new);
1455 
1456         /* Prevent bits in the migration bitmap from being set by
1457          * migration_bitmap_sync_range() while we swap in the new bitmap.
1458          * Migration remains safe if bits are only cleared concurrently.
1459          */
1461         qemu_mutex_lock(&migration_bitmap_mutex);
1462         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1463         bitmap_set(bitmap->bmap, old, new - old);
1464 
1465         /* We don't have a way to safely extend the unsentmap
1466          * with RCU; so mark it as missing, and entry to postcopy
1467          * will fail.
1468          */
1469         bitmap->unsentmap = NULL;
1470 
1471         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1472         qemu_mutex_unlock(&migration_bitmap_mutex);
1473         migration_dirty_pages += new - old;
1474         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1475     }
1476 }
1477 
1478 /*
1479  * 'expected' is the value you expect the bitmap mostly to be full
1480  * of; it won't bother printing lines that are all this value.
1481  * If 'todump' is null the migration bitmap is dumped.
1482  */
1483 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1484 {
1485     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1486 
1487     int64_t cur;
1488     int64_t linelen = 128;
1489     char linebuf[129];
1490 
1491     if (!todump) {
1492         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1493     }
1494 
1495     for (cur = 0; cur < ram_pages; cur += linelen) {
1496         int64_t curb;
1497         bool found = false;
1498         /*
1499          * Last line; catch the case where the line length
1500          * is longer than remaining ram
1501          */
1502         if (cur + linelen > ram_pages) {
1503             linelen = ram_pages - cur;
1504         }
1505         for (curb = 0; curb < linelen; curb++) {
1506             bool thisbit = test_bit(cur + curb, todump);
1507             linebuf[curb] = thisbit ? '1' : '.';
1508             found = found || (thisbit != expected);
1509         }
1510         if (found) {
1511             linebuf[curb] = '\0';
1512             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1513         }
1514     }
1515 }
1516 
1517 /* **** functions for postcopy ***** */
1518 
1519 /*
1520  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1521  * Note: At this point the 'unsentmap' is the processed bitmap combined
1522  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1523  * start,length: Indexes into the bitmap for the first bit
1524  *            representing the named block and length in target-pages
1525  */
1526 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1527                                         PostcopyDiscardState *pds,
1528                                         unsigned long start,
1529                                         unsigned long length)
1530 {
1531     unsigned long end = start + length; /* one after the end */
1532     unsigned long current;
1533     unsigned long *unsentmap;
1534 
1535     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1536     for (current = start; current < end; ) {
1537         unsigned long one = find_next_bit(unsentmap, end, current);
1538 
1539         if (one <= end) {
1540             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1541             unsigned long discard_length;
1542 
1543             if (zero >= end) {
1544                 discard_length = end - one;
1545             } else {
1546                 discard_length = zero - one;
1547             }
1548             if (discard_length) {
1549                 postcopy_discard_send_range(ms, pds, one, discard_length);
1550             }
1551             current = one + discard_length;
1552         } else {
1553             current = one;
1554         }
1555     }
1556 
1557     return 0;
1558 }
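
/*
 * Example of the run detection above (hypothetical bitmap contents): if the
 * unsentmap has bits 5, 6 and 7 set within [start, end), 'one' is 5, 'zero'
 * is 8, and a single discard of length 3 starting at page 5 is sent.
 */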
1559 
1560 /*
1561  * Utility for the outgoing postcopy code.
1562  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1563  *   passing it bitmap indexes and name.
1564  * Returns: 0 on success
1565  * (qemu_ram_foreach_block ends up passing unscaled lengths
1566  *  which would mean postcopy code would have to deal with target page)
1567  */
1568 static int postcopy_each_ram_send_discard(MigrationState *ms)
1569 {
1570     struct RAMBlock *block;
1571     int ret;
1572 
1573     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1574         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1575         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1576                                                                first,
1577                                                                block->idstr);
1578 
1579         /*
1580          * Postcopy sends chunks of bitmap over the wire, but it
1581          * just needs indexes at this point, avoids it having
1582          * target page specific code.
1583          */
1584         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1585                                     block->used_length >> TARGET_PAGE_BITS);
1586         postcopy_discard_send_finish(ms, pds);
1587         if (ret) {
1588             return ret;
1589         }
1590     }
1591 
1592     return 0;
1593 }
1594 
1595 /*
1596  * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1597  *   the two bitmaps, which are similar but one is inverted.
1598  *
1599  * We search for runs of target-pages that don't start or end on a
1600  * host page boundary;
1601  * unsent_pass=true: Cleans up partially unsent host pages by searching
1602  *                 the unsentmap
1603  * unsent_pass=false: Cleans up partially dirty host pages by searching
1604  *                 the main migration bitmap
1605  *
1606  */
1607 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1608                                           RAMBlock *block,
1609                                           PostcopyDiscardState *pds)
1610 {
1611     unsigned long *bitmap;
1612     unsigned long *unsentmap;
1613     unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1614     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1615     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1616     unsigned long last = first + (len - 1);
1617     unsigned long run_start;
1618 
1619     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1620     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1621 
1622     if (unsent_pass) {
1623         /* Find a sent page */
1624         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1625     } else {
1626         /* Find a dirty page */
1627         run_start = find_next_bit(bitmap, last + 1, first);
1628     }
1629 
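    /*
     * run_start now indexes the first page of interest for this pass
     * (the first sent page when unsent_pass, the first dirty page
     * otherwise); walk each run, fixing up any that straddles a
     * host-page boundary.
     */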
1630     while (run_start <= last) {
1631         bool do_fixup = false;
1632         unsigned long fixup_start_addr;
1633         unsigned long host_offset;
1634 
1635         /*
1636          * If the start of this run of pages is in the middle of a host
1637          * page, then we need to fixup this host page.
1638          */
1639         host_offset = run_start % host_ratio;
1640         if (host_offset) {
1641             do_fixup = true;
1642             run_start -= host_offset;
1643             fixup_start_addr = run_start;
1644             /* For the next pass */
1645             run_start = run_start + host_ratio;
1646         } else {
1647             /* Find the end of this run */
1648             unsigned long run_end;
1649             if (unsent_pass) {
1650                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1651             } else {
1652                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1653             }
1654             /*
1655              * If the end isn't at the start of a host page, then the
1656              * run doesn't finish at the end of a host page
1657              * and we need to discard.
1658              */
1659             host_offset = run_end % host_ratio;
1660             if (host_offset) {
1661                 do_fixup = true;
1662                 fixup_start_addr = run_end - host_offset;
1663                 /*
1664                  * This host page has gone, the next loop iteration starts
1665                  * from after the fixup
1666                  */
1667                 run_start = fixup_start_addr + host_ratio;
1668             } else {
1669                 /*
1670                  * No discards on this iteration, next loop starts from
1671                  * next sent/dirty page
1672                  */
1673                 run_start = run_end + 1;
1674             }
1675         }
1676 
1677         if (do_fixup) {
1678             unsigned long page;
1679 
1680             /* Tell the destination to discard this page */
1681             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1682                 /* For the unsent_pass we:
1683                  *     discard partially sent pages
1684                  * For the !unsent_pass (dirty) we:
1685                  *     discard partially dirty pages that were sent
1686                  *     (any partially sent pages were already discarded
1687                  *     by the previous unsent_pass)
1688                  */
1689                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1690                                             host_ratio);
1691             }
1692 
1693             /* Clean up the bitmap */
1694             for (page = fixup_start_addr;
1695                  page < fixup_start_addr + host_ratio; page++) {
1696                 /* All pages in this host page are now not sent */
1697                 set_bit(page, unsentmap);
1698 
1699                 /*
1700                  * Remark them as dirty, updating the count for any pages
1701                  * that weren't previously dirty.
1702                  */
1703                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1704             }
1705         }
1706 
1707         if (unsent_pass) {
1708             /* Find the next sent page for the next iteration */
1709             run_start = find_next_zero_bit(unsentmap, last + 1,
1710                                            run_start);
1711         } else {
1712             /* Find the next dirty page for the next iteration */
1713             run_start = find_next_bit(bitmap, last + 1, run_start);
1714         }
1715     }
1716 }
1717 
1718 /*
1719  * Utility for the outgoing postcopy code.
1720  *
1721  * Discard any partially sent host-page size chunks, mark any partially
1722  * dirty host-page size chunks as all dirty.
1723  *
1724  * Returns: 0 on success
1725  */
1726 static int postcopy_chunk_hostpages(MigrationState *ms)
1727 {
1728     struct RAMBlock *block;
1729 
1730     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1731         /* Easy case - TPS==HPS - nothing to be done */
1732         return 0;
1733     }
1734 
1735     /* Easiest way to make sure we don't resume in the middle of a host-page */
1736     last_seen_block = NULL;
1737     last_sent_block = NULL;
1738     last_offset     = 0;
1739 
1740     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1741         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1742 
1743         PostcopyDiscardState *pds =
1744                          postcopy_discard_send_init(ms, first, block->idstr);
1745 
1746         /* First pass: Discard all partially sent host pages */
1747         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1748         /*
1749          * Second pass: Ensure that all partially dirty host pages are made
1750          * fully dirty.
1751          */
1752         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1753 
1754         postcopy_discard_send_finish(ms, pds);
1755     } /* ram_list loop */
1756 
1757     return 0;
1758 }
1759 
1760 /*
1761  * Transmit the set of pages to be discarded after precopy to the target.
1762  * These are pages that:
1763  *     a) have been previously transmitted but are now dirty again
1764  *     b) have never been transmitted; this ensures that any pages on the
1765  *        destination that have been mapped by background tasks get
1766  *        discarded (transparent huge pages are the specific concern)
1767  * Hopefully this is pretty sparse.
1768  */
1769 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1770 {
1771     int ret;
1772     unsigned long *bitmap, *unsentmap;
1773 
1774     rcu_read_lock();
1775 
1776     /* This should be our last sync, the src is now paused */
1777     migration_bitmap_sync();
1778 
1779     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1780     if (!unsentmap) {
1781         /* We don't have a safe way to resize the unsentmap, so
1782          * if the bitmap was resized it will be NULL at this
1783          * point.
1784          */
1785         error_report("migration ram resized during precopy phase");
1786         rcu_read_unlock();
1787         return -EINVAL;
1788     }
1789 
1790     /* Deal with TPS != HPS */
1791     ret = postcopy_chunk_hostpages(ms);
1792     if (ret) {
1793         rcu_read_unlock();
1794         return ret;
1795     }
1796 
1797     /*
1798      * Update the unsentmap to be unsentmap = unsentmap | dirty
1799      */
1800     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1801     bitmap_or(unsentmap, unsentmap, bitmap,
1802                last_ram_offset() >> TARGET_PAGE_BITS);
1803 
1804 
1805     trace_ram_postcopy_send_discard_bitmap();
1806 #ifdef DEBUG_POSTCOPY
1807     ram_debug_dump_bitmap(unsentmap, true);
1808 #endif
1809 
1810     ret = postcopy_each_ram_send_discard(ms);
1811     rcu_read_unlock();
1812 
1813     return ret;
1814 }
1815 
1816 /*
1817  * At the start of the postcopy phase of migration, any now-dirty
1818  * precopied pages are discarded.
1819  *
1820  * start, length describe a byte address range within the RAMBlock
1821  *
1822  * Returns 0 on success.
1823  */
1824 int ram_discard_range(MigrationIncomingState *mis,
1825                       const char *block_name,
1826                       uint64_t start, size_t length)
1827 {
1828     int ret = -1;
1829 
1830     rcu_read_lock();
1831     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1832 
1833     if (!rb) {
1834         error_report("ram_discard_range: Failed to find block '%s'",
1835                      block_name);
1836         goto err;
1837     }
1838 
1839     uint8_t *host_startaddr = rb->host + start;
1840 
1841     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1842         error_report("ram_discard_range: Unaligned start address: %p",
1843                      host_startaddr);
1844         goto err;
1845     }
1846 
1847     if ((start + length) <= rb->used_length) {
1848         uint8_t *host_endaddr = host_startaddr + length;
1849         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1850             error_report("ram_discard_range: Unaligned end address: %p",
1851                          host_endaddr);
1852             goto err;
1853         }
1854         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1855     } else {
1856         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1857                      "/%zx/" RAM_ADDR_FMT")",
1858                      block_name, start, length, rb->used_length);
1859     }
1860 
1861 err:
1862     rcu_read_unlock();
1863 
1864     return ret;
1865 }
1866 
1867 static int ram_save_init_globals(void)
1868 {
1869     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1870 
1871     dirty_rate_high_cnt = 0;
1872     bitmap_sync_count = 0;
1873     migration_bitmap_sync_init();
1874     qemu_mutex_init(&migration_bitmap_mutex);
1875 
1876     if (migrate_use_xbzrle()) {
1877         XBZRLE_cache_lock();
1878         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1879         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1880                                   TARGET_PAGE_SIZE,
1881                                   TARGET_PAGE_SIZE);
1882         if (!XBZRLE.cache) {
1883             XBZRLE_cache_unlock();
1884             error_report("Error creating cache");
1885             return -1;
1886         }
1887         XBZRLE_cache_unlock();
1888 
1889         /* We prefer not to abort if there is no memory */
1890         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1891         if (!XBZRLE.encoded_buf) {
1892             error_report("Error allocating encoded_buf");
1893             return -1;
1894         }
1895 
1896         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1897         if (!XBZRLE.current_buf) {
1898             error_report("Error allocating current_buf");
1899             g_free(XBZRLE.encoded_buf);
1900             XBZRLE.encoded_buf = NULL;
1901             return -1;
1902         }
1903 
1904         acct_clear();
1905     }
1906 
1907     /* For memory_global_dirty_log_start below.  */
1908     qemu_mutex_lock_iothread();
1909 
1910     qemu_mutex_lock_ramlist();
1911     rcu_read_lock();
1912     bytes_transferred = 0;
1913     reset_ram_globals();
1914 
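    /* Allocate a dirty bitmap covering all of RAM and start with every page
     * marked dirty so that everything is sent at least once */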
1915     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1916     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1917     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1918     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1919 
1920     if (migrate_postcopy_ram()) {
1921         migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1922         bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1923     }
1924 
1925     /*
1926      * Count the total number of pages used by ram blocks not including any
1927      * gaps due to alignment or unplugs.
1928      */
1929     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1930 
1931     memory_global_dirty_log_start();
1932     migration_bitmap_sync();
1933     qemu_mutex_unlock_ramlist();
1934     qemu_mutex_unlock_iothread();
1935     rcu_read_unlock();
1936 
1937     return 0;
1938 }
1939 
1940 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1941  * a long-running RCU critical section.  When RCU reclaims in the code
1942  * start to become numerous it will be necessary to reduce the
1943  * granularity of these critical sections.
1944  */
1945 
1946 static int ram_save_setup(QEMUFile *f, void *opaque)
1947 {
1948     RAMBlock *block;
1949 
1950     /* migration has already setup the bitmap, reuse it. */
1951     if (!migration_in_colo_state()) {
1952         if (ram_save_init_globals() < 0) {
1953             return -1;
1954         }
1955     }
1956 
1957     rcu_read_lock();
1958 
1959     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1960 
1961     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1962         qemu_put_byte(f, strlen(block->idstr));
1963         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1964         qemu_put_be64(f, block->used_length);
1965     }
1966 
1967     rcu_read_unlock();
1968 
1969     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1970     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1971 
1972     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1973 
1974     return 0;
1975 }
1976 
1977 static int ram_save_iterate(QEMUFile *f, void *opaque)
1978 {
1979     int ret;
1980     int i;
1981     int64_t t0;
1982     int done = 0;
1983 
1984     rcu_read_lock();
1985     if (ram_list.version != last_version) {
1986         reset_ram_globals();
1987     }
1988 
1989     /* Read version before ram_list.blocks */
1990     smp_rmb();
1991 
1992     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1993 
1994     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1995     i = 0;
1996     while ((ret = qemu_file_rate_limit(f)) == 0) {
1997         int pages;
1998 
1999         pages = ram_find_and_save_block(f, false, &bytes_transferred);
2000         /* no more pages to send */
2001         if (pages == 0) {
2002             done = 1;
2003             break;
2004         }
2005         acct_info.iterations++;
2006 
2007         /* we want to check in the 1st loop, just in case it was the 1st time
2008            and we had to sync the dirty bitmap.
2009            qemu_clock_get_ns() is a bit expensive, so we only check once
2010            every few iterations
2011         */
2012         if ((i & 63) == 0) {
2013             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2014             if (t1 > MAX_WAIT) {
2015                 trace_ram_save_iterate_big_wait(t1, i);
2016                 break;
2017             }
2018         }
2019         i++;
2020     }
2021     flush_compressed_data(f);
2022     rcu_read_unlock();
2023 
2024     /*
2025      * Must occur before EOS (or any QEMUFile operation)
2026      * because of RDMA protocol.
2027      */
2028     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2029 
2030     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
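    /* Account for the 8-byte EOS marker written just above */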
2031     bytes_transferred += 8;
2032 
2033     ret = qemu_file_get_error(f);
2034     if (ret < 0) {
2035         return ret;
2036     }
2037 
2038     return done;
2039 }
2040 
2041 /* Called with iothread lock */
2042 static int ram_save_complete(QEMUFile *f, void *opaque)
2043 {
2044     rcu_read_lock();
2045 
2046     if (!migration_in_postcopy(migrate_get_current())) {
2047         migration_bitmap_sync();
2048     }
2049 
2050     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2051 
2052     /* try transferring iterative blocks of memory */
2053 
2054     /* flush all remaining blocks regardless of rate limiting */
2055     while (true) {
2056         int pages;
2057 
2058         pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2059                                         &bytes_transferred);
2060         /* no more blocks to send */
2061         if (pages == 0) {
2062             break;
2063         }
2064     }
2065 
2066     flush_compressed_data(f);
2067     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2068 
2069     rcu_read_unlock();
2070 
2071     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2072 
2073     return 0;
2074 }
2075 
2076 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2077                              uint64_t *non_postcopiable_pending,
2078                              uint64_t *postcopiable_pending)
2079 {
2080     uint64_t remaining_size;
2081 
2082     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2083 
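    /* If what remains looks small enough to complete, resync the dirty
     * bitmap so the pending estimate we report is up to date */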
2084     if (!migration_in_postcopy(migrate_get_current()) &&
2085         remaining_size < max_size) {
2086         qemu_mutex_lock_iothread();
2087         rcu_read_lock();
2088         migration_bitmap_sync();
2089         rcu_read_unlock();
2090         qemu_mutex_unlock_iothread();
2091         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2092     }
2093 
2094     /* We can do postcopy, and all the data is postcopiable */
2095     *postcopiable_pending += remaining_size;
2096 }
2097 
2098 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2099 {
2100     unsigned int xh_len;
2101     int xh_flags;
2102     uint8_t *loaded_data;
2103 
2104     if (!xbzrle_decoded_buf) {
2105         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2106     }
2107     loaded_data = xbzrle_decoded_buf;
2108 
2109     /* extract RLE header */
2110     xh_flags = qemu_get_byte(f);
2111     xh_len = qemu_get_be16(f);
2112 
2113     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2114         error_report("Failed to load XBZRLE page - wrong compression!");
2115         return -1;
2116     }
2117 
2118     if (xh_len > TARGET_PAGE_SIZE) {
2119         error_report("Failed to load XBZRLE page - len overflow!");
2120         return -1;
2121     }
2122     /* load data and decode */
2123     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2124 
2125     /* decode RLE */
2126     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2127                              TARGET_PAGE_SIZE) == -1) {
2128         error_report("Failed to load XBZRLE page - decode error!");
2129         return -1;
2130     }
2131 
2132     return 0;
2133 }
2134 
2135 /*
2136  * Read a RAMBlock ID from the stream f.
2137  *
2138  * Must be called from within an RCU critical section; returns a pointer
2139  * from within the RCU-protected ram_list.
2140  *
2141  * f: Stream to read from
2142  * flags: Page flags (mostly to see if it's a continuation of the previous block)
2143  */
2144 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2145                                               int flags)
2146 {
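    /* Cache the last block seen so that pages flagged RAM_SAVE_FLAG_CONTINUE
     * can omit the block ID on the wire */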
2147     static RAMBlock *block = NULL;
2148     char id[256];
2149     uint8_t len;
2150 
2151     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2152         if (!block) {
2153             error_report("Ack, bad migration stream!");
2154             return NULL;
2155         }
2156         return block;
2157     }
2158 
2159     len = qemu_get_byte(f);
2160     qemu_get_buffer(f, (uint8_t *)id, len);
2161     id[len] = 0;
2162 
2163     block = qemu_ram_block_by_name(id);
2164     if (!block) {
2165         error_report("Can't find block %s", id);
2166         return NULL;
2167     }
2168 
2169     return block;
2170 }
2171 
2172 static inline void *host_from_ram_block_offset(RAMBlock *block,
2173                                                ram_addr_t offset)
2174 {
2175     if (!offset_in_ramblock(block, offset)) {
2176         return NULL;
2177     }
2178 
2179     return block->host + offset;
2180 }
2181 
2182 /*
2183  * If a page (or a whole RDMA chunk) has been
2184  * determined to be zero, then zap it.
2185  */
2186 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2187 {
2188     if (ch != 0 || !is_zero_range(host, size)) {
2189         memset(host, ch, size);
2190     }
2191 }
2192 
2193 static void *do_data_decompress(void *opaque)
2194 {
2195     DecompressParam *param = opaque;
2196     unsigned long pagesize;
2197     uint8_t *des;
2198     int len;
2199 
2200     qemu_mutex_lock(&param->mutex);
2201     while (!param->quit) {
2202         if (param->des) {
2203             des = param->des;
2204             len = param->len;
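            /* Consume the request before dropping the lock so it isn't
             * picked up again on the next loop iteration */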
2205             param->des = 0;
2206             qemu_mutex_unlock(&param->mutex);
2207 
2208             pagesize = TARGET_PAGE_SIZE;
2209             /* uncompress() can fail in some cases, especially
2210              * when the page was dirtied while being compressed; that's
2211              * not a problem because the dirty page will be retransferred
2212              * and uncompress() won't corrupt the data in other pages.
2213              */
2214             uncompress((Bytef *)des, &pagesize,
2215                        (const Bytef *)param->compbuf, len);
2216 
2217             qemu_mutex_lock(&decomp_done_lock);
2218             param->done = true;
2219             qemu_cond_signal(&decomp_done_cond);
2220             qemu_mutex_unlock(&decomp_done_lock);
2221 
2222             qemu_mutex_lock(&param->mutex);
2223         } else {
2224             qemu_cond_wait(&param->cond, &param->mutex);
2225         }
2226     }
2227     qemu_mutex_unlock(&param->mutex);
2228 
2229     return NULL;
2230 }
2231 
2232 static void wait_for_decompress_done(void)
2233 {
2234     int idx, thread_count;
2235 
2236     if (!migrate_use_compression()) {
2237         return;
2238     }
2239 
2240     thread_count = migrate_decompress_threads();
2241     qemu_mutex_lock(&decomp_done_lock);
2242     for (idx = 0; idx < thread_count; idx++) {
2243         while (!decomp_param[idx].done) {
2244             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2245         }
2246     }
2247     qemu_mutex_unlock(&decomp_done_lock);
2248 }
2249 
2250 void migrate_decompress_threads_create(void)
2251 {
2252     int i, thread_count;
2253 
2254     thread_count = migrate_decompress_threads();
2255     decompress_threads = g_new0(QemuThread, thread_count);
2256     decomp_param = g_new0(DecompressParam, thread_count);
2257     qemu_mutex_init(&decomp_done_lock);
2258     qemu_cond_init(&decomp_done_cond);
2259     for (i = 0; i < thread_count; i++) {
2260         qemu_mutex_init(&decomp_param[i].mutex);
2261         qemu_cond_init(&decomp_param[i].cond);
2262         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
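        /* Start out idle so decompress_data_with_multi_threads() can hand
         * work to this thread straight away */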
2263         decomp_param[i].done = true;
2264         decomp_param[i].quit = false;
2265         qemu_thread_create(decompress_threads + i, "decompress",
2266                            do_data_decompress, decomp_param + i,
2267                            QEMU_THREAD_JOINABLE);
2268     }
2269 }
2270 
2271 void migrate_decompress_threads_join(void)
2272 {
2273     int i, thread_count;
2274 
2275     thread_count = migrate_decompress_threads();
2276     for (i = 0; i < thread_count; i++) {
2277         qemu_mutex_lock(&decomp_param[i].mutex);
2278         decomp_param[i].quit = true;
2279         qemu_cond_signal(&decomp_param[i].cond);
2280         qemu_mutex_unlock(&decomp_param[i].mutex);
2281     }
2282     for (i = 0; i < thread_count; i++) {
2283         qemu_thread_join(decompress_threads + i);
2284         qemu_mutex_destroy(&decomp_param[i].mutex);
2285         qemu_cond_destroy(&decomp_param[i].cond);
2286         g_free(decomp_param[i].compbuf);
2287     }
2288     g_free(decompress_threads);
2289     g_free(decomp_param);
2290     decompress_threads = NULL;
2291     decomp_param = NULL;
2292 }
2293 
2294 static void decompress_data_with_multi_threads(QEMUFile *f,
2295                                                void *host, int len)
2296 {
2297     int idx, thread_count;
2298 
2299     thread_count = migrate_decompress_threads();
2300     qemu_mutex_lock(&decomp_done_lock);
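    /* Find an idle decompression thread and hand it the compressed page;
     * if none is free, wait for one to signal completion */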
2301     while (true) {
2302         for (idx = 0; idx < thread_count; idx++) {
2303             if (decomp_param[idx].done) {
2304                 decomp_param[idx].done = false;
2305                 qemu_mutex_lock(&decomp_param[idx].mutex);
2306                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2307                 decomp_param[idx].des = host;
2308                 decomp_param[idx].len = len;
2309                 qemu_cond_signal(&decomp_param[idx].cond);
2310                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2311                 break;
2312             }
2313         }
2314         if (idx < thread_count) {
2315             break;
2316         } else {
2317             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2318         }
2319     }
2320     qemu_mutex_unlock(&decomp_done_lock);
2321 }
2322 
2323 /*
2324  * Allocate data structures etc. needed by incoming migration with postcopy-ram;
2325  * postcopy-ram's similarly named postcopy_ram_incoming_init does the work
2326  */
2327 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2328 {
2329     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2330 
2331     return postcopy_ram_incoming_init(mis, ram_pages);
2332 }
2333 
2334 /*
2335  * Called in postcopy mode by ram_load().
2336  * rcu_read_lock is taken prior to this being called.
2337  */
2338 static int ram_load_postcopy(QEMUFile *f)
2339 {
2340     int flags = 0, ret = 0;
2341     bool place_needed = false;
2342     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2343     MigrationIncomingState *mis = migration_incoming_get_current();
2344     /* Temporary page that is later 'placed' */
2345     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2346     void *last_host = NULL;
2347     bool all_zero = false;
2348 
2349     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2350         ram_addr_t addr;
2351         void *host = NULL;
2352         void *page_buffer = NULL;
2353         void *place_source = NULL;
2354         uint8_t ch;
2355 
2356         addr = qemu_get_be64(f);
2357         flags = addr & ~TARGET_PAGE_MASK;
2358         addr &= TARGET_PAGE_MASK;
2359 
2360         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2361         place_needed = false;
2362         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2363             RAMBlock *block = ram_block_from_stream(f, flags);
2364 
2365             host = host_from_ram_block_offset(block, addr);
2366             if (!host) {
2367                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2368                 ret = -EINVAL;
2369                 break;
2370             }
2371             /*
2372              * Postcopy requires that we place whole host pages atomically.
2373              * To make it atomic, the data is read into a temporary page
2374              * that's moved into place later.
2375              * The migration protocol uses, possibly smaller, target pages;
2376              * however the source ensures it always sends all the components
2377              * of a host page in order.
2378              */
2379             page_buffer = postcopy_host_page +
2380                           ((uintptr_t)host & ~qemu_host_page_mask);
2381             /* If all TP are zero then we can optimise the place */
2382             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2383                 all_zero = true;
2384             } else {
2385                 /* not the 1st TP within the HP */
2386                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2387                     error_report("Non-sequential target page %p/%p",
2388                                   host, last_host);
2389                     ret = -EINVAL;
2390                     break;
2391                 }
2392             }
2393 
2394 
2395             /*
2396              * If it's the last part of a host page then we place the host
2397              * page
2398              */
2399             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2400                                      ~qemu_host_page_mask) == 0;
2401             place_source = postcopy_host_page;
2402         }
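        /* Remember this target page so the next one can be checked for being
         * sequential within the same host page */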
2403         last_host = host;
2404 
2405         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2406         case RAM_SAVE_FLAG_COMPRESS:
2407             ch = qemu_get_byte(f);
2408             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2409             if (ch) {
2410                 all_zero = false;
2411             }
2412             break;
2413 
2414         case RAM_SAVE_FLAG_PAGE:
2415             all_zero = false;
2416             if (!place_needed || !matching_page_sizes) {
2417                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2418             } else {
2419                 /* Avoids the qemu_file copy during postcopy, which will
2420                  * do a copy into place later anyway; we can only do it
2421                  * when this read happens in one go (matching page sizes)
2422                  */
2423                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2424                                          TARGET_PAGE_SIZE);
2425             }
2426             break;
2427         case RAM_SAVE_FLAG_EOS:
2428             /* normal exit */
2429             break;
2430         default:
2431             error_report("Unknown combination of migration flags: %#x"
2432                          " (postcopy mode)", flags);
2433             ret = -EINVAL;
2434         }
2435 
2436         if (place_needed) {
2437             /* This gets called at the last target page in the host page */
2438             if (all_zero) {
2439                 ret = postcopy_place_page_zero(mis,
2440                                                host + TARGET_PAGE_SIZE -
2441                                                qemu_host_page_size);
2442             } else {
2443                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2444                                                qemu_host_page_size,
2445                                                place_source);
2446             }
2447         }
2448         if (!ret) {
2449             ret = qemu_file_get_error(f);
2450         }
2451     }
2452 
2453     return ret;
2454 }
2455 
2456 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2457 {
2458     int flags = 0, ret = 0;
2459     static uint64_t seq_iter;
2460     int len = 0;
2461     /*
2462      * If the system is running in postcopy mode, page inserts into host
2463      * memory must be atomic
2464      */
2465     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2466 
2467     seq_iter++;
2468 
2469     if (version_id != 4) {
2470         ret = -EINVAL;
2471     }
2472 
2473     /* This RCU critical section can be very long running.
2474      * When RCU reclaims in the code start to become numerous,
2475      * it will be necessary to reduce the granularity of this
2476      * critical section.
2477      */
2478     rcu_read_lock();
2479 
2480     if (postcopy_running) {
2481         ret = ram_load_postcopy(f);
2482     }
2483 
2484     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2485         ram_addr_t addr, total_ram_bytes;
2486         void *host = NULL;
2487         uint8_t ch;
2488 
2489         addr = qemu_get_be64(f);
2490         flags = addr & ~TARGET_PAGE_MASK;
2491         addr &= TARGET_PAGE_MASK;
2492 
2493         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2494                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2495             RAMBlock *block = ram_block_from_stream(f, flags);
2496 
2497             host = host_from_ram_block_offset(block, addr);
2498             if (!host) {
2499                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2500                 ret = -EINVAL;
2501                 break;
2502             }
2503         }
2504 
2505         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2506         case RAM_SAVE_FLAG_MEM_SIZE:
2507             /* Synchronize RAM block list */
2508             total_ram_bytes = addr;
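            /* The MEM_SIZE record carries an (idstr, length) pair for every
             * RAMBlock; walk them until the advertised total is consumed */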
2509             while (!ret && total_ram_bytes) {
2510                 RAMBlock *block;
2511                 char id[256];
2512                 ram_addr_t length;
2513 
2514                 len = qemu_get_byte(f);
2515                 qemu_get_buffer(f, (uint8_t *)id, len);
2516                 id[len] = 0;
2517                 length = qemu_get_be64(f);
2518 
2519                 block = qemu_ram_block_by_name(id);
2520                 if (block) {
2521                     if (length != block->used_length) {
2522                         Error *local_err = NULL;
2523 
2524                         ret = qemu_ram_resize(block, length,
2525                                               &local_err);
2526                         if (local_err) {
2527                             error_report_err(local_err);
2528                         }
2529                     }
2530                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2531                                           block->idstr);
2532                 } else {
2533                     error_report("Unknown ramblock \"%s\", cannot "
2534                                  "accept migration", id);
2535                     ret = -EINVAL;
2536                 }
2537 
2538                 total_ram_bytes -= length;
2539             }
2540             break;
2541 
2542         case RAM_SAVE_FLAG_COMPRESS:
2543             ch = qemu_get_byte(f);
2544             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2545             break;
2546 
2547         case RAM_SAVE_FLAG_PAGE:
2548             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2549             break;
2550 
2551         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2552             len = qemu_get_be32(f);
2553             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2554                 error_report("Invalid compressed data length: %d", len);
2555                 ret = -EINVAL;
2556                 break;
2557             }
2558             decompress_data_with_multi_threads(f, host, len);
2559             break;
2560 
2561         case RAM_SAVE_FLAG_XBZRLE:
2562             if (load_xbzrle(f, addr, host) < 0) {
2563                 error_report("Failed to decompress XBZRLE page at "
2564                              RAM_ADDR_FMT, addr);
2565                 ret = -EINVAL;
2566                 break;
2567             }
2568             break;
2569         case RAM_SAVE_FLAG_EOS:
2570             /* normal exit */
2571             break;
2572         default:
2573             if (flags & RAM_SAVE_FLAG_HOOK) {
2574                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2575             } else {
2576                 error_report("Unknown combination of migration flags: %#x",
2577                              flags);
2578                 ret = -EINVAL;
2579             }
2580         }
2581         if (!ret) {
2582             ret = qemu_file_get_error(f);
2583         }
2584     }
2585 
2586     wait_for_decompress_done();
2587     rcu_read_unlock();
2588     trace_ram_load_complete(ret, seq_iter);
2589     return ret;
2590 }
2591 
2592 static SaveVMHandlers savevm_ram_handlers = {
2593     .save_live_setup = ram_save_setup,
2594     .save_live_iterate = ram_save_iterate,
2595     .save_live_complete_postcopy = ram_save_complete,
2596     .save_live_complete_precopy = ram_save_complete,
2597     .save_live_pending = ram_save_pending,
2598     .load_state = ram_load,
2599     .cleanup = ram_migration_cleanup,
2600 };
2601 
2602 void ram_mig_init(void)
2603 {
2604     qemu_mutex_init(&XBZRLE.lock);
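    /* Section version 4 must match the version_id check in ram_load() */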
2605     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2606 }
2607