xref: /openbmc/qemu/migration/ram.c (revision 795c40b8)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47 
48 /***********************************************************/
49 /* ram save/restore */
50 
51 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
52  * worked for pages that were filled with the same char.  We switched
53  * it to only search for the zero value, and renamed it to avoid
54  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
55  */
56 
57 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
58 #define RAM_SAVE_FLAG_ZERO     0x02
59 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
60 #define RAM_SAVE_FLAG_PAGE     0x08
61 #define RAM_SAVE_FLAG_EOS      0x10
62 #define RAM_SAVE_FLAG_CONTINUE 0x20
63 #define RAM_SAVE_FLAG_XBZRLE   0x40
64 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
65 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
66 
67 static uint8_t *ZERO_TARGET_PAGE;
68 
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
70 {
71     return buffer_is_zero(p, size);
72 }
73 
74 /* This struct contains the XBZRLE cache and a static page
75    used by the compression */
76 static struct {
77     /* buffer used for XBZRLE encoding */
78     uint8_t *encoded_buf;
79     /* buffer for storing page content */
80     uint8_t *current_buf;
81     /* Cache for XBZRLE, Protected by lock. */
82     PageCache *cache;
83     QemuMutex lock;
84 } XBZRLE;
85 
86 /* buffer used for XBZRLE decoding */
87 static uint8_t *xbzrle_decoded_buf;
88 
89 static void XBZRLE_cache_lock(void)
90 {
91     if (migrate_use_xbzrle())
92         qemu_mutex_lock(&XBZRLE.lock);
93 }
94 
95 static void XBZRLE_cache_unlock(void)
96 {
97     if (migrate_use_xbzrle())
98         qemu_mutex_unlock(&XBZRLE.lock);
99 }
100 
101 /**
102  * xbzrle_cache_resize: resize the xbzrle cache
103  *
104  * This function is called from qmp_migrate_set_cache_size in the main
105  * thread, possibly while a migration is in progress.  A running
106  * migration may be using the cache and might finish during this call,
107  * hence changes to the cache are protected by XBZRLE.lock.
108  *
109  * Returns the new_size or negative in case of error.
110  * Returns the new cache size (rounded down to a power of two) or negative on error.
111  * @new_size: new cache size
112  */
113 int64_t xbzrle_cache_resize(int64_t new_size)
114 {
115     PageCache *new_cache;
116     int64_t ret;
117 
118     if (new_size < TARGET_PAGE_SIZE) {
119         return -1;
120     }
121 
122     XBZRLE_cache_lock();
123 
124     if (XBZRLE.cache != NULL) {
125         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
126             goto out_new_size;
127         }
128         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
129                                         TARGET_PAGE_SIZE);
130         if (!new_cache) {
131             error_report("Error creating cache");
132             ret = -1;
133             goto out;
134         }
135 
136         cache_fini(XBZRLE.cache);
137         XBZRLE.cache = new_cache;
138     }
139 
140 out_new_size:
141     ret = pow2floor(new_size);
142 out:
143     XBZRLE_cache_unlock();
144     return ret;
145 }
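
/*
 * Example (illustrative sketch): the effective cache size is rounded down
 * to a power of two by pow2floor(), so a request that is not a power of
 * two is silently shrunk:
 *
 *     int64_t actual = xbzrle_cache_resize(70 * 1024 * 1024);
 *     // actual == 67108864 (64 MiB) on success, negative on error
 */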
146 
147 /*
148  * An outstanding page request, on the source, having been received
149  * and queued
150  */
151 struct RAMSrcPageRequest {
152     RAMBlock *rb;
153     hwaddr    offset;
154     hwaddr    len;
155 
156     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
157 };
158 
159 /* State of RAM for migration */
160 struct RAMState {
161     /* QEMUFile used for this migration */
162     QEMUFile *f;
163     /* Last block that we have visited searching for dirty pages */
164     RAMBlock *last_seen_block;
165     /* Last block from where we have sent data */
166     RAMBlock *last_sent_block;
167     /* Last dirty target page we have sent */
168     ram_addr_t last_page;
169     /* last ram version we have seen */
170     uint32_t last_version;
171     /* We are in the first round */
172     bool ram_bulk_stage;
173     /* How many times the dirty rate has been too high */
174     int dirty_rate_high_cnt;
175     /* How many times we have synchronized the bitmap */
176     uint64_t bitmap_sync_count;
177     /* these variables are used for bitmap sync */
178     /* last time we did a full bitmap_sync */
179     int64_t time_last_bitmap_sync;
180     /* bytes transferred at start_time */
181     uint64_t bytes_xfer_prev;
182     /* number of dirty pages since start_time */
183     uint64_t num_dirty_pages_period;
184     /* xbzrle misses since the beginning of the period */
185     uint64_t xbzrle_cache_miss_prev;
186     /* number of iterations at the beginning of period */
187     uint64_t iterations_prev;
188     /* Accounting fields */
189     /* number of zero pages.  It used to be pages filled by the same char. */
190     uint64_t zero_pages;
191     /* number of normal transferred pages */
192     uint64_t norm_pages;
193     /* Iterations since start */
194     uint64_t iterations;
195     /* xbzrle transmitted bytes.  Notice that these are the compressed
196      * sizes, so they can't be calculated from the number of pages */
197     uint64_t xbzrle_bytes;
198     /* xbzrle transmitted pages */
199     uint64_t xbzrle_pages;
200     /* xbzrle number of cache miss */
201     uint64_t xbzrle_cache_miss;
202     /* xbzrle miss rate */
203     double xbzrle_cache_miss_rate;
204     /* xbzrle number of overflows */
205     uint64_t xbzrle_overflows;
206     /* number of dirty bits in the bitmap */
207     uint64_t migration_dirty_pages;
208     /* total number of bytes transferred */
209     uint64_t bytes_transferred;
210     /* number of dirtied pages in the last second */
211     uint64_t dirty_pages_rate;
212     /* Count of requests incoming from destination */
213     uint64_t postcopy_requests;
214     /* protects modification of the bitmap */
215     QemuMutex bitmap_mutex;
216     /* The RAMBlock used in the last src_page_requests */
217     RAMBlock *last_req_rb;
218     /* Queue of outstanding page requests from the destination */
219     QemuMutex src_page_req_mutex;
220     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
221 };
222 typedef struct RAMState RAMState;
223 
224 static RAMState ram_state;
225 
226 uint64_t dup_mig_pages_transferred(void)
227 {
228     return ram_state.zero_pages;
229 }
230 
231 uint64_t norm_mig_pages_transferred(void)
232 {
233     return ram_state.norm_pages;
234 }
235 
236 uint64_t xbzrle_mig_bytes_transferred(void)
237 {
238     return ram_state.xbzrle_bytes;
239 }
240 
241 uint64_t xbzrle_mig_pages_transferred(void)
242 {
243     return ram_state.xbzrle_pages;
244 }
245 
246 uint64_t xbzrle_mig_pages_cache_miss(void)
247 {
248     return ram_state.xbzrle_cache_miss;
249 }
250 
251 double xbzrle_mig_cache_miss_rate(void)
252 {
253     return ram_state.xbzrle_cache_miss_rate;
254 }
255 
256 uint64_t xbzrle_mig_pages_overflow(void)
257 {
258     return ram_state.xbzrle_overflows;
259 }
260 
261 uint64_t ram_bytes_transferred(void)
262 {
263     return ram_state.bytes_transferred;
264 }
265 
266 uint64_t ram_bytes_remaining(void)
267 {
268     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
269 }
270 
271 uint64_t ram_dirty_sync_count(void)
272 {
273     return ram_state.bitmap_sync_count;
274 }
275 
276 uint64_t ram_dirty_pages_rate(void)
277 {
278     return ram_state.dirty_pages_rate;
279 }
280 
281 uint64_t ram_postcopy_requests(void)
282 {
283     return ram_state.postcopy_requests;
284 }
285 
286 /* used by the search for pages to send */
287 struct PageSearchStatus {
288     /* Current block being searched */
289     RAMBlock    *block;
290     /* Current page to search from */
291     unsigned long page;
292     /* Set once we wrap around */
293     bool         complete_round;
294 };
295 typedef struct PageSearchStatus PageSearchStatus;
296 
297 struct CompressParam {
298     bool done;
299     bool quit;
300     QEMUFile *file;
301     QemuMutex mutex;
302     QemuCond cond;
303     RAMBlock *block;
304     ram_addr_t offset;
305 };
306 typedef struct CompressParam CompressParam;
307 
308 struct DecompressParam {
309     bool done;
310     bool quit;
311     QemuMutex mutex;
312     QemuCond cond;
313     void *des;
314     uint8_t *compbuf;
315     int len;
316 };
317 typedef struct DecompressParam DecompressParam;
318 
319 static CompressParam *comp_param;
320 static QemuThread *compress_threads;
321 /* comp_done_cond is used to wake up the migration thread when
322  * one of the compression threads has finished the compression.
323  * comp_done_lock is used together with comp_done_cond.
324  */
325 static QemuMutex comp_done_lock;
326 static QemuCond comp_done_cond;
327 /* The empty QEMUFileOps will be used by file in CompressParam */
328 static const QEMUFileOps empty_ops = { };
329 
330 static DecompressParam *decomp_param;
331 static QemuThread *decompress_threads;
332 static QemuMutex decomp_done_lock;
333 static QemuCond decomp_done_cond;
334 
335 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
336                                 ram_addr_t offset);
337 
338 static void *do_data_compress(void *opaque)
339 {
340     CompressParam *param = opaque;
341     RAMBlock *block;
342     ram_addr_t offset;
343 
344     qemu_mutex_lock(&param->mutex);
345     while (!param->quit) {
346         if (param->block) {
347             block = param->block;
348             offset = param->offset;
349             param->block = NULL;
350             qemu_mutex_unlock(&param->mutex);
351 
352             do_compress_ram_page(param->file, block, offset);
353 
354             qemu_mutex_lock(&comp_done_lock);
355             param->done = true;
356             qemu_cond_signal(&comp_done_cond);
357             qemu_mutex_unlock(&comp_done_lock);
358 
359             qemu_mutex_lock(&param->mutex);
360         } else {
361             qemu_cond_wait(&param->cond, &param->mutex);
362         }
363     }
364     qemu_mutex_unlock(&param->mutex);
365 
366     return NULL;
367 }
368 
369 static inline void terminate_compression_threads(void)
370 {
371     int idx, thread_count;
372 
373     thread_count = migrate_compress_threads();
374 
375     for (idx = 0; idx < thread_count; idx++) {
376         qemu_mutex_lock(&comp_param[idx].mutex);
377         comp_param[idx].quit = true;
378         qemu_cond_signal(&comp_param[idx].cond);
379         qemu_mutex_unlock(&comp_param[idx].mutex);
380     }
381 }
382 
383 void migrate_compress_threads_join(void)
384 {
385     int i, thread_count;
386 
387     if (!migrate_use_compression()) {
388         return;
389     }
390     terminate_compression_threads();
391     thread_count = migrate_compress_threads();
392     for (i = 0; i < thread_count; i++) {
393         qemu_thread_join(compress_threads + i);
394         qemu_fclose(comp_param[i].file);
395         qemu_mutex_destroy(&comp_param[i].mutex);
396         qemu_cond_destroy(&comp_param[i].cond);
397     }
398     qemu_mutex_destroy(&comp_done_lock);
399     qemu_cond_destroy(&comp_done_cond);
400     g_free(compress_threads);
401     g_free(comp_param);
402     compress_threads = NULL;
403     comp_param = NULL;
404 }
405 
406 void migrate_compress_threads_create(void)
407 {
408     int i, thread_count;
409 
410     if (!migrate_use_compression()) {
411         return;
412     }
413     thread_count = migrate_compress_threads();
414     compress_threads = g_new0(QemuThread, thread_count);
415     comp_param = g_new0(CompressParam, thread_count);
416     qemu_cond_init(&comp_done_cond);
417     qemu_mutex_init(&comp_done_lock);
418     for (i = 0; i < thread_count; i++) {
419         /* comp_param[i].file is just used as a dummy buffer to save data,
420          * set its ops to empty.
421          */
422         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
423         comp_param[i].done = true;
424         comp_param[i].quit = false;
425         qemu_mutex_init(&comp_param[i].mutex);
426         qemu_cond_init(&comp_param[i].cond);
427         qemu_thread_create(compress_threads + i, "compress",
428                            do_data_compress, comp_param + i,
429                            QEMU_THREAD_JOINABLE);
430     }
431 }
432 
433 /**
434  * save_page_header: write page header to wire
435  *
436  * If this is a page from a new block, it also writes the block identification
437  *
438  * Returns the number of bytes written
439  *
440  * @rs: current RAM state
441  * @f: QEMUFile where to send the data
442  * @block: block that contains the page we want to send
443  * @offset: offset inside the block for the page; the lower bits contain flags
444  */
445 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
446                                ram_addr_t offset)
447 {
448     size_t size, len;
449 
450     if (block == rs->last_sent_block) {
451         offset |= RAM_SAVE_FLAG_CONTINUE;
452     }
453     qemu_put_be64(f, offset);
454     size = 8;
455 
456     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
457         len = strlen(block->idstr);
458         qemu_put_byte(f, len);
459         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
460         size += 1 + len;
461         rs->last_sent_block = block;
462     }
463     return size;
464 }
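
/*
 * Wire layout produced by save_page_header() (illustrative summary):
 *
 *     be64   offset | flags        always, 8 bytes
 *     u8     len                   only if RAM_SAVE_FLAG_CONTINUE is clear
 *     bytes  idstr[len]            only if RAM_SAVE_FLAG_CONTINUE is clear
 *
 * so a page of the block announced last time costs 8 header bytes, while a
 * page that introduces a new block costs 8 + 1 + strlen(idstr) bytes,
 * matching the returned size.
 */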
465 
466 /**
467  * mig_throttle_guest_down: throttle down the guest
468  *
469  * Reduce the amount of guest CPU execution to hopefully slow down memory
470  * writes. If the guest dirty memory rate drops below the rate at
471  * which we can transfer pages to the destination then we should be
472  * able to complete migration. Some workloads dirty memory way too
473  * fast and will not effectively converge, even with auto-converge.
474  */
475 static void mig_throttle_guest_down(void)
476 {
477     MigrationState *s = migrate_get_current();
478     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
479     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
480 
481     /* We have not started throttling yet. Let's start it. */
482     if (!cpu_throttle_active()) {
483         cpu_throttle_set(pct_initial);
484     } else {
485         /* Throttling already on, just increase the rate */
486         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
487     }
488 }
489 
490 /**
491  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
492  *
493  * @rs: current RAM state
494  * @current_addr: address for the zero page
495  *
496  * Update the xbzrle cache to reflect a page that's been sent as all 0.
497  * The important thing is that a stale (not-yet-0'd) page be replaced
498  * by the new data.
499  * As a bonus, if the page wasn't in the cache it gets added so that
500  * when a small write is made into the 0'd page it gets XBZRLE sent.
501  */
502 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
503 {
504     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
505         return;
506     }
507 
508     /* We don't care if this fails to allocate a new cache page
509      * as long as it updated an old one */
510     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
511                  rs->bitmap_sync_count);
512 }
513 
514 #define ENCODING_FLAG_XBZRLE 0x1
515 
516 /**
517  * save_xbzrle_page: compress and send current page
518  *
519  * Returns: 1 means that we wrote the page
520  *          0 means that the page is identical to the one already sent
521  *          -1 means that xbzrle would be longer than normal
522  *
523  * @rs: current RAM state
524  * @current_data: pointer to the address of the page contents
525  * @current_addr: addr of the page
526  * @block: block that contains the page we want to send
527  * @offset: offset inside the block for the page
528  * @last_stage: if we are at the completion stage
529  */
530 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
531                             ram_addr_t current_addr, RAMBlock *block,
532                             ram_addr_t offset, bool last_stage)
533 {
534     int encoded_len = 0, bytes_xbzrle;
535     uint8_t *prev_cached_page;
536 
537     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
538         rs->xbzrle_cache_miss++;
539         if (!last_stage) {
540             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
541                              rs->bitmap_sync_count) == -1) {
542                 return -1;
543             } else {
544                 /* update *current_data when the page has been
545                    inserted into cache */
546                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
547             }
548         }
549         return -1;
550     }
551 
552     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
553 
554     /* save current buffer into memory */
555     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
556 
557     /* XBZRLE encoding (if there is no overflow) */
558     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
559                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
560                                        TARGET_PAGE_SIZE);
561     if (encoded_len == 0) {
562         trace_save_xbzrle_page_skipping();
563         return 0;
564     } else if (encoded_len == -1) {
565         trace_save_xbzrle_page_overflow();
566         rs->xbzrle_overflows++;
567         /* update data in the cache */
568         if (!last_stage) {
569             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
570             *current_data = prev_cached_page;
571         }
572         return -1;
573     }
574 
575     /* we need to update the data in the cache, in order to get the same data */
576     if (!last_stage) {
577         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
578     }
579 
580     /* Send XBZRLE based compressed page */
581     bytes_xbzrle = save_page_header(rs, rs->f, block,
582                                     offset | RAM_SAVE_FLAG_XBZRLE);
583     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
584     qemu_put_be16(rs->f, encoded_len);
585     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
586     bytes_xbzrle += encoded_len + 1 + 2;
587     rs->xbzrle_pages++;
588     rs->xbzrle_bytes += bytes_xbzrle;
589     rs->bytes_transferred += bytes_xbzrle;
590 
591     return 1;
592 }
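
/*
 * Illustrative summary of the XBZRLE wire format emitted above:
 *
 *     save_page_header(offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8     ENCODING_FLAG_XBZRLE
 *     be16   encoded_len
 *     bytes  encoded_buf[encoded_len]
 *
 * which is why bytes_xbzrle is incremented by encoded_len + 1 + 2.
 */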
593 
594 /**
595  * migration_bitmap_find_dirty: find the next dirty page from start
596  *
597  * Called with rcu_read_lock() to protect migration_bitmap
598  *
599  * Returns the page offset within the memory region of the start of a dirty page
600  *
601  * @rs: current RAM state
602  * @rb: RAMBlock where to search for dirty pages
603  * @start: page where we start the search
604  */
605 static inline
606 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
607                                           unsigned long start)
608 {
609     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
610     unsigned long *bitmap = rb->bmap;
611     unsigned long next;
612 
613     if (rs->ram_bulk_stage && start > 0) {
614         next = start + 1;
615     } else {
616         next = find_next_bit(bitmap, size, start);
617     }
618 
619     return next;
620 }
621 
622 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
623                                                 RAMBlock *rb,
624                                                 unsigned long page)
625 {
626     bool ret;
627 
628     ret = test_and_clear_bit(page, rb->bmap);
629 
630     if (ret) {
631         rs->migration_dirty_pages--;
632     }
633     return ret;
634 }
635 
636 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
637                                         ram_addr_t start, ram_addr_t length)
638 {
639     rs->migration_dirty_pages +=
640         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
641                                               &rs->num_dirty_pages_period);
642 }
643 
644 /**
645  * ram_pagesize_summary: calculate all the pagesizes of a VM
646  *
647  * Returns a summary bitmap of the page sizes of all RAMBlocks
648  *
649  * For VMs with just normal pages this is equivalent to the host page
650  * size. If it's got some huge pages then it's the OR of all the
651  * different page sizes.
652  */
653 uint64_t ram_pagesize_summary(void)
654 {
655     RAMBlock *block;
656     uint64_t summary = 0;
657 
658     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
659         summary |= block->page_size;
660     }
661 
662     return summary;
663 }
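
/*
 * Illustrative example: a guest backed by normal 4 KiB pages plus 2 MiB
 * hugepages would return 0x1000 | 0x200000 == 0x201000, so a caller can
 * tell at a glance that more than one page size is in use.
 */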
664 
665 static void migration_bitmap_sync(RAMState *rs)
666 {
667     RAMBlock *block;
668     int64_t end_time;
669     uint64_t bytes_xfer_now;
670 
671     rs->bitmap_sync_count++;
672 
673     if (!rs->bytes_xfer_prev) {
674         rs->bytes_xfer_prev = ram_bytes_transferred();
675     }
676 
677     if (!rs->time_last_bitmap_sync) {
678         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
679     }
680 
681     trace_migration_bitmap_sync_start();
682     memory_global_dirty_log_sync();
683 
684     qemu_mutex_lock(&rs->bitmap_mutex);
685     rcu_read_lock();
686     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
687         migration_bitmap_sync_range(rs, block, 0, block->used_length);
688     }
689     rcu_read_unlock();
690     qemu_mutex_unlock(&rs->bitmap_mutex);
691 
692     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
693 
694     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
695 
696     /* more than 1 second = 1000 milliseconds */
697     if (end_time > rs->time_last_bitmap_sync + 1000) {
698         if (migrate_auto_converge()) {
699             /* The following detection logic can be refined later. For now:
700                check whether the bytes dirtied are more than 50% of the
701                approximate number of bytes that just got transferred since
702                the last time we were in this routine. If that happens twice,
703                start or increase throttling */
704             bytes_xfer_now = ram_bytes_transferred();
705 
706             if (rs->dirty_pages_rate &&
707                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
708                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
709                (rs->dirty_rate_high_cnt++ >= 2)) {
710                     trace_migration_throttle();
711                     rs->dirty_rate_high_cnt = 0;
712                     mig_throttle_guest_down();
713              }
714              rs->bytes_xfer_prev = bytes_xfer_now;
715         }
716 
717         if (migrate_use_xbzrle()) {
718             if (rs->iterations_prev != rs->iterations) {
719                 rs->xbzrle_cache_miss_rate =
720                    (double)(rs->xbzrle_cache_miss -
721                             rs->xbzrle_cache_miss_prev) /
722                    (rs->iterations - rs->iterations_prev);
723             }
724             rs->iterations_prev = rs->iterations;
725             rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
726         }
727         rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
728             / (end_time - rs->time_last_bitmap_sync);
729         rs->time_last_bitmap_sync = end_time;
730         rs->num_dirty_pages_period = 0;
731     }
732     if (migrate_use_events()) {
733         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
734     }
735 }
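
/*
 * Illustrative example of the auto-converge check above (assuming 4 KiB
 * target pages): if ~40000 pages (~160 MB) were dirtied during the last
 * period while ~200 MB were transferred, then 160 MB > 200 MB / 2 holds;
 * once dirty_rate_high_cnt passes its threshold, mig_throttle_guest_down()
 * kicks in and starts or raises the CPU throttle.
 */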
736 
737 /**
738  * save_zero_page: send the zero page to the stream
739  *
740  * Returns the number of pages written.
741  *
742  * @rs: current RAM state
743  * @block: block that contains the page we want to send
744  * @offset: offset inside the block for the page
745  * @p: pointer to the page
746  */
747 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
748                           uint8_t *p)
749 {
750     int pages = -1;
751 
752     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
753         rs->zero_pages++;
754         rs->bytes_transferred +=
755             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
756         qemu_put_byte(rs->f, 0);
757         rs->bytes_transferred += 1;
758         pages = 1;
759     }
760 
761     return pages;
762 }
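
/*
 * Illustrative note: a zero page therefore costs only the page header plus
 * one byte on the wire (the fill byte, always 0 now that only all-zero
 * pages are detected), hence the "+ 1" added to bytes_transferred.
 */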
763 
764 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
765 {
766     if (!migrate_release_ram() || !migration_in_postcopy()) {
767         return;
768     }
769 
770     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
771 }
772 
773 /**
774  * ram_save_page: send the given page to the stream
775  *
776  * Returns the number of pages written.
777  *          < 0 - error
778  *          >=0 - Number of pages written - this might legally be 0
779  *                if xbzrle noticed the page was the same.
780  *
781  * @rs: current RAM state
782  * @pss: data about the page we want to send
784  * @last_stage: if we are at the completion stage
785  */
786 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
787 {
788     int pages = -1;
789     uint64_t bytes_xmit;
790     ram_addr_t current_addr;
791     uint8_t *p;
792     int ret;
793     bool send_async = true;
794     RAMBlock *block = pss->block;
795     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
796 
797     p = block->host + offset;
798     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
799 
800     /* When in doubt, send the page as normal */
801     bytes_xmit = 0;
802     ret = ram_control_save_page(rs->f, block->offset,
803                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
804     if (bytes_xmit) {
805         rs->bytes_transferred += bytes_xmit;
806         pages = 1;
807     }
808 
809     XBZRLE_cache_lock();
810 
811     current_addr = block->offset + offset;
812 
813     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
814         if (ret != RAM_SAVE_CONTROL_DELAYED) {
815             if (bytes_xmit > 0) {
816                 rs->norm_pages++;
817             } else if (bytes_xmit == 0) {
818                 rs->zero_pages++;
819             }
820         }
821     } else {
822         pages = save_zero_page(rs, block, offset, p);
823         if (pages > 0) {
824             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
825              * page would be stale
826              */
827             xbzrle_cache_zero_page(rs, current_addr);
828             ram_release_pages(block->idstr, offset, pages);
829         } else if (!rs->ram_bulk_stage &&
830                    !migration_in_postcopy() && migrate_use_xbzrle()) {
831             pages = save_xbzrle_page(rs, &p, current_addr, block,
832                                      offset, last_stage);
833             if (!last_stage) {
834                 /* Can't send this cached data async, since the cache page
835                  * might get updated before it gets to the wire
836                  */
837                 send_async = false;
838             }
839         }
840     }
841 
842     /* XBZRLE overflow or normal page */
843     if (pages == -1) {
844         rs->bytes_transferred += save_page_header(rs, rs->f, block,
845                                                   offset | RAM_SAVE_FLAG_PAGE);
846         if (send_async) {
847             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
848                                   migrate_release_ram() &&
849                                   migration_in_postcopy());
850         } else {
851             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
852         }
853         rs->bytes_transferred += TARGET_PAGE_SIZE;
854         pages = 1;
855         rs->norm_pages++;
856     }
857 
858     XBZRLE_cache_unlock();
859 
860     return pages;
861 }
862 
863 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
864                                 ram_addr_t offset)
865 {
866     RAMState *rs = &ram_state;
867     int bytes_sent, blen;
868     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
869 
870     bytes_sent = save_page_header(rs, f, block, offset |
871                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
872     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
873                                      migrate_compress_level());
874     if (blen < 0) {
875         bytes_sent = 0;
876         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
877         error_report("compressed data failed!");
878     } else {
879         bytes_sent += blen;
880         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
881     }
882 
883     return bytes_sent;
884 }
885 
886 static void flush_compressed_data(RAMState *rs)
887 {
888     int idx, len, thread_count;
889 
890     if (!migrate_use_compression()) {
891         return;
892     }
893     thread_count = migrate_compress_threads();
894 
895     qemu_mutex_lock(&comp_done_lock);
896     for (idx = 0; idx < thread_count; idx++) {
897         while (!comp_param[idx].done) {
898             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
899         }
900     }
901     qemu_mutex_unlock(&comp_done_lock);
902 
903     for (idx = 0; idx < thread_count; idx++) {
904         qemu_mutex_lock(&comp_param[idx].mutex);
905         if (!comp_param[idx].quit) {
906             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
907             rs->bytes_transferred += len;
908         }
909         qemu_mutex_unlock(&comp_param[idx].mutex);
910     }
911 }
912 
913 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
914                                        ram_addr_t offset)
915 {
916     param->block = block;
917     param->offset = offset;
918 }
919 
920 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
921                                            ram_addr_t offset)
922 {
923     int idx, thread_count, bytes_xmit = -1, pages = -1;
924 
925     thread_count = migrate_compress_threads();
926     qemu_mutex_lock(&comp_done_lock);
927     while (true) {
928         for (idx = 0; idx < thread_count; idx++) {
929             if (comp_param[idx].done) {
930                 comp_param[idx].done = false;
931                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
932                 qemu_mutex_lock(&comp_param[idx].mutex);
933                 set_compress_params(&comp_param[idx], block, offset);
934                 qemu_cond_signal(&comp_param[idx].cond);
935                 qemu_mutex_unlock(&comp_param[idx].mutex);
936                 pages = 1;
937                 rs->norm_pages++;
938                 rs->bytes_transferred += bytes_xmit;
939                 break;
940             }
941         }
942         if (pages > 0) {
943             break;
944         } else {
945             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
946         }
947     }
948     qemu_mutex_unlock(&comp_done_lock);
949 
950     return pages;
951 }
952 
953 /**
954  * ram_save_compressed_page: compress the given page and send it to the stream
955  *
956  * Returns the number of pages written.
957  *
958  * @rs: current RAM state
959  * @pss: data about the page we want to send
961  * @last_stage: if we are at the completion stage
962  */
963 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
964                                     bool last_stage)
965 {
966     int pages = -1;
967     uint64_t bytes_xmit = 0;
968     uint8_t *p;
969     int ret, blen;
970     RAMBlock *block = pss->block;
971     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
972 
973     p = block->host + offset;
974 
975     ret = ram_control_save_page(rs->f, block->offset,
976                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
977     if (bytes_xmit) {
978         rs->bytes_transferred += bytes_xmit;
979         pages = 1;
980     }
981     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
982         if (ret != RAM_SAVE_CONTROL_DELAYED) {
983             if (bytes_xmit > 0) {
984                 rs->norm_pages++;
985             } else if (bytes_xmit == 0) {
986                 rs->zero_pages++;
987             }
988         }
989     } else {
990         /* When starting the process of a new block, the first page of
991          * the block should be sent out before other pages in the same
992          * block, and all the pages in the last block should have been
993          * sent out.  Keeping this order is important, because the 'cont'
994          * flag is used to avoid resending the block name.
995          */
996         if (block != rs->last_sent_block) {
997             flush_compressed_data(rs);
998             pages = save_zero_page(rs, block, offset, p);
999             if (pages == -1) {
1000                 /* Make sure the first page is sent out before other pages */
1001                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1002                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1003                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1004                                                  migrate_compress_level());
1005                 if (blen > 0) {
1006                     rs->bytes_transferred += bytes_xmit + blen;
1007                     rs->norm_pages++;
1008                     pages = 1;
1009                 } else {
1010                     qemu_file_set_error(rs->f, blen);
1011                     error_report("compressed data failed!");
1012                 }
1013             }
1014             if (pages > 0) {
1015                 ram_release_pages(block->idstr, offset, pages);
1016             }
1017         } else {
1018             pages = save_zero_page(rs, block, offset, p);
1019             if (pages == -1) {
1020                 pages = compress_page_with_multi_thread(rs, block, offset);
1021             } else {
1022                 ram_release_pages(block->idstr, offset, pages);
1023             }
1024         }
1025     }
1026 
1027     return pages;
1028 }
1029 
1030 /**
1031  * find_dirty_block: find the next dirty page and update any state
1032  * associated with the search process.
1033  *
1034  * Returns whether a page was found
1035  *
1036  * @rs: current RAM state
1037  * @pss: data about the state of the current dirty page scan
1038  * @again: set to false if the search has scanned the whole of RAM
1039  */
1040 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1041 {
1042     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1043     if (pss->complete_round && pss->block == rs->last_seen_block &&
1044         pss->page >= rs->last_page) {
1045         /*
1046          * We've been once around the RAM and haven't found anything.
1047          * Give up.
1048          */
1049         *again = false;
1050         return false;
1051     }
1052     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1053         /* Didn't find anything in this RAM Block */
1054         pss->page = 0;
1055         pss->block = QLIST_NEXT_RCU(pss->block, next);
1056         if (!pss->block) {
1057             /* Hit the end of the list */
1058             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1059             /* Flag that we've looped */
1060             pss->complete_round = true;
1061             rs->ram_bulk_stage = false;
1062             if (migrate_use_xbzrle()) {
1063                 /* If xbzrle is on, stop using the data compression at this
1064                  * point. In theory, xbzrle can do better than compression.
1065                  */
1066                 flush_compressed_data(rs);
1067             }
1068         }
1069         /* Didn't find anything this time, but try again on the new block */
1070         *again = true;
1071         return false;
1072     } else {
1073         /* Can go around again, but... */
1074         *again = true;
1075         /* We've found something so probably don't need to */
1076         return true;
1077     }
1078 }
1079 
1080 /**
1081  * unqueue_page: gets a page off the queue
1082  *
1083  * Helper for 'get_queued_page' - gets a page off the queue
1084  *
1085  * Returns the block of the page (or NULL if none available)
1086  *
1087  * @rs: current RAM state
1088  * @offset: used to return the offset within the RAMBlock
1089  */
1090 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1091 {
1092     RAMBlock *block = NULL;
1093 
1094     qemu_mutex_lock(&rs->src_page_req_mutex);
1095     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1096         struct RAMSrcPageRequest *entry =
1097                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1098         block = entry->rb;
1099         *offset = entry->offset;
1100 
1101         if (entry->len > TARGET_PAGE_SIZE) {
1102             entry->len -= TARGET_PAGE_SIZE;
1103             entry->offset += TARGET_PAGE_SIZE;
1104         } else {
1105             memory_region_unref(block->mr);
1106             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1107             g_free(entry);
1108         }
1109     }
1110     qemu_mutex_unlock(&rs->src_page_req_mutex);
1111 
1112     return block;
1113 }
1114 
1115 /**
1116  * get_queued_page: unqueue a page from the postcopy requests
1117  *
1118  * Skips pages that are already sent (!dirty)
1119  *
1120  * Returns whether a queued page was found
1121  *
1122  * @rs: current RAM state
1123  * @pss: data about the state of the current dirty page scan
1124  */
1125 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1126 {
1127     RAMBlock  *block;
1128     ram_addr_t offset;
1129     bool dirty;
1130 
1131     do {
1132         block = unqueue_page(rs, &offset);
1133         /*
1134          * We're sending this page, and since it's postcopy nothing else
1135          * will dirty it, and we must make sure it doesn't get sent again
1136          * even if this queue request was received after the background
1137          * search already sent it.
1138          */
1139         if (block) {
1140             unsigned long page;
1141 
1142             page = offset >> TARGET_PAGE_BITS;
1143             dirty = test_bit(page, block->bmap);
1144             if (!dirty) {
1145                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1146                        page, test_bit(page, block->unsentmap));
1147             } else {
1148                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1149             }
1150         }
1151 
1152     } while (block && !dirty);
1153 
1154     if (block) {
1155         /*
1156          * As soon as we start servicing pages out of order, we have
1157          * to kill the bulk stage, since the bulk stage assumes
1158          * (in migration_bitmap_find_dirty) that every page is dirty,
1159          * and that's no longer true.
1160          */
1161         rs->ram_bulk_stage = false;
1162 
1163         /*
1164          * We want the background search to continue from the queued page
1165          * since the guest is likely to want other pages near to the page
1166          * it just requested.
1167          */
1168         pss->block = block;
1169         pss->page = offset >> TARGET_PAGE_BITS;
1170     }
1171 
1172     return !!block;
1173 }
1174 
1175 /**
1176  * migration_page_queue_free: drop any remaining pages in the ram
1177  * request queue
1178  *
1179  * It should be empty at the end anyway, but in error cases there may
1180  * be some left.  In case there is any page left, we drop it.
1181  *
1182  */
1183 void migration_page_queue_free(void)
1184 {
1185     struct RAMSrcPageRequest *mspr, *next_mspr;
1186     RAMState *rs = &ram_state;
1187     /* This queue generally should be empty - but in the case of a failed
1188      * migration it might have some entries left over.
1189      */
1190     rcu_read_lock();
1191     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1192         memory_region_unref(mspr->rb->mr);
1193         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1194         g_free(mspr);
1195     }
1196     rcu_read_unlock();
1197 }
1198 
1199 /**
1200  * ram_save_queue_pages: queue the page for transmission
1201  *
1202  * A request from postcopy destination for example.
1203  *
1204  * Returns zero on success or negative on error
1205  *
1206  * @rbname: Name of the RAMBlock of the request. NULL means the
1207  *          same as the last one.
1208  * @start: starting address from the start of the RAMBlock
1209  * @len: length (in bytes) to send
1210  */
1211 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1212 {
1213     RAMBlock *ramblock;
1214     RAMState *rs = &ram_state;
1215 
1216     rs->postcopy_requests++;
1217     rcu_read_lock();
1218     if (!rbname) {
1219         /* Reuse last RAMBlock */
1220         ramblock = rs->last_req_rb;
1221 
1222         if (!ramblock) {
1223             /*
1224              * Shouldn't happen, we can't reuse the last RAMBlock if
1225              * it's the 1st request.
1226              */
1227             error_report("ram_save_queue_pages no previous block");
1228             goto err;
1229         }
1230     } else {
1231         ramblock = qemu_ram_block_by_name(rbname);
1232 
1233         if (!ramblock) {
1234             /* We shouldn't be asked for a non-existent RAMBlock */
1235             error_report("ram_save_queue_pages no block '%s'", rbname);
1236             goto err;
1237         }
1238         rs->last_req_rb = ramblock;
1239     }
1240     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1241     if (start+len > ramblock->used_length) {
1242         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1243                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1244                      __func__, start, len, ramblock->used_length);
1245         goto err;
1246     }
1247 
1248     struct RAMSrcPageRequest *new_entry =
1249         g_malloc0(sizeof(struct RAMSrcPageRequest));
1250     new_entry->rb = ramblock;
1251     new_entry->offset = start;
1252     new_entry->len = len;
1253 
1254     memory_region_ref(ramblock->mr);
1255     qemu_mutex_lock(&rs->src_page_req_mutex);
1256     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1257     qemu_mutex_unlock(&rs->src_page_req_mutex);
1258     rcu_read_unlock();
1259 
1260     return 0;
1261 
1262 err:
1263     rcu_read_unlock();
1264     return -1;
1265 }
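
/*
 * Hedged usage sketch (block name and range are only examples): a postcopy
 * page fault on the destination turns into a request such as
 *
 *     if (ram_save_queue_pages("pc.ram", 0x200000, 0x1000)) {
 *         // request could not be queued (unknown block, overrun, ...)
 *     }
 *
 * and the queued entry is later picked up by get_queued_page().
 */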
1266 
1267 /**
1268  * ram_save_target_page: save one target page
1269  *
1270  * Returns the number of pages written
1271  *
1272  * @rs: current RAM state
1274  * @pss: data about the page we want to send
1275  * @last_stage: if we are at the completion stage
1276  */
1277 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1278                                 bool last_stage)
1279 {
1280     int res = 0;
1281 
1282     /* Check if the page is dirty and if it is, send it */
1283     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1284         /*
1285          * If xbzrle is on, stop using the data compression after first
1286          * round of migration even if compression is enabled. In theory,
1287          * xbzrle can do better than compression.
1288          */
1289         if (migrate_use_compression() &&
1290             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1291             res = ram_save_compressed_page(rs, pss, last_stage);
1292         } else {
1293             res = ram_save_page(rs, pss, last_stage);
1294         }
1295 
1296         if (res < 0) {
1297             return res;
1298         }
1299         if (pss->block->unsentmap) {
1300             clear_bit(pss->page, pss->block->unsentmap);
1301         }
1302     }
1303 
1304     return res;
1305 }
1306 
1307 /**
1308  * ram_save_host_page: save a whole host page
1309  *
1310  * Starting at *offset send pages up to the end of the current host
1311  * page. It's valid for the initial offset to point into the middle of
1312  * a host page in which case the remainder of the hostpage is sent.
1313  * Only dirty target pages are sent. Note that the host page size may
1314  * be a huge page for this block.
1315  *
1316  * Returns the number of pages written or negative on error
1317  *
1318  * @rs: current RAM state
1320  * @pss: data about the page we want to send
1321  * @last_stage: if we are at the completion stage
1322  */
1323 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1324                               bool last_stage)
1325 {
1326     int tmppages, pages = 0;
1327     size_t pagesize_bits =
1328         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1329 
1330     do {
1331         tmppages = ram_save_target_page(rs, pss, last_stage);
1332         if (tmppages < 0) {
1333             return tmppages;
1334         }
1335 
1336         pages += tmppages;
1337         pss->page++;
1338     } while (pss->page & (pagesize_bits - 1));
1339 
1340     /* The offset we leave with is the last one we looked at */
1341     pss->page--;
1342     return pages;
1343 }
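
/*
 * Illustrative example: for a RAMBlock backed by 2 MiB hugepages with
 * 4 KiB target pages, pagesize_bits == 512, so the loop above keeps
 * sending target pages until (pss->page & 511) wraps to 0, i.e. until the
 * end of the current host page, before stepping pss->page back by one.
 */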
1344 
1345 /**
1346  * ram_find_and_save_block: finds a dirty page and sends it to f
1347  *
1348  * Called within an RCU critical section.
1349  *
1350  * Returns the number of pages written where zero means no dirty pages
1351  *
1352  * @rs: current RAM state
1353  * @last_stage: if we are at the completion stage
1354  *
1355  * On systems where host-page-size > target-page-size it will send all the
1356  * pages in a host page that are dirty.
1357  */
1358 
1359 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1360 {
1361     PageSearchStatus pss;
1362     int pages = 0;
1363     bool again, found;
1364 
1365     /* No dirty page as there is zero RAM */
1366     if (!ram_bytes_total()) {
1367         return pages;
1368     }
1369 
1370     pss.block = rs->last_seen_block;
1371     pss.page = rs->last_page;
1372     pss.complete_round = false;
1373 
1374     if (!pss.block) {
1375         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1376     }
1377 
1378     do {
1379         again = true;
1380         found = get_queued_page(rs, &pss);
1381 
1382         if (!found) {
1383             /* priority queue empty, so just search for something dirty */
1384             found = find_dirty_block(rs, &pss, &again);
1385         }
1386 
1387         if (found) {
1388             pages = ram_save_host_page(rs, &pss, last_stage);
1389         }
1390     } while (!pages && again);
1391 
1392     rs->last_seen_block = pss.block;
1393     rs->last_page = pss.page;
1394 
1395     return pages;
1396 }
1397 
1398 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1399 {
1400     uint64_t pages = size / TARGET_PAGE_SIZE;
1401     RAMState *rs = &ram_state;
1402 
1403     if (zero) {
1404         rs->zero_pages += pages;
1405     } else {
1406         rs->norm_pages += pages;
1407         rs->bytes_transferred += size;
1408         qemu_update_position(f, size);
1409     }
1410 }
1411 
1412 uint64_t ram_bytes_total(void)
1413 {
1414     RAMBlock *block;
1415     uint64_t total = 0;
1416 
1417     rcu_read_lock();
1418     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1419         total += block->used_length;
1420     rcu_read_unlock();
1421     return total;
1422 }
1423 
1424 void free_xbzrle_decoded_buf(void)
1425 {
1426     g_free(xbzrle_decoded_buf);
1427     xbzrle_decoded_buf = NULL;
1428 }
1429 
1430 static void ram_migration_cleanup(void *opaque)
1431 {
1432     RAMBlock *block;
1433 
1434     /* The caller must hold the iothread lock or be in a bottom half, so
1435      * there is no writing race against this migration bitmap
1436      */
1437     memory_global_dirty_log_stop();
1438 
1439     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1440         g_free(block->bmap);
1441         block->bmap = NULL;
1442         g_free(block->unsentmap);
1443         block->unsentmap = NULL;
1444     }
1445 
1446     XBZRLE_cache_lock();
1447     if (XBZRLE.cache) {
1448         cache_fini(XBZRLE.cache);
1449         g_free(XBZRLE.encoded_buf);
1450         g_free(XBZRLE.current_buf);
1451         g_free(ZERO_TARGET_PAGE);
1452         XBZRLE.cache = NULL;
1453         XBZRLE.encoded_buf = NULL;
1454         XBZRLE.current_buf = NULL;
1455     }
1456     XBZRLE_cache_unlock();
1457 }
1458 
1459 static void ram_state_reset(RAMState *rs)
1460 {
1461     rs->last_seen_block = NULL;
1462     rs->last_sent_block = NULL;
1463     rs->last_page = 0;
1464     rs->last_version = ram_list.version;
1465     rs->ram_bulk_stage = true;
1466 }
1467 
1468 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1469 
1470 /*
1471  * 'expected' is the value you expect the bitmap mostly to be full
1472  * of; it won't bother printing lines that are all this value.
1474  */
1475 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1476                            unsigned long pages)
1477 {
1478     int64_t cur;
1479     int64_t linelen = 128;
1480     char linebuf[129];
1481 
1482     for (cur = 0; cur < pages; cur += linelen) {
1483         int64_t curb;
1484         bool found = false;
1485         /*
1486          * Last line; catch the case where the line length
1487          * is longer than remaining ram
1488          */
1489         if (cur + linelen > pages) {
1490             linelen = pages - cur;
1491         }
1492         for (curb = 0; curb < linelen; curb++) {
1493             bool thisbit = test_bit(cur + curb, todump);
1494             linebuf[curb] = thisbit ? '1' : '.';
1495             found = found || (thisbit != expected);
1496         }
1497         if (found) {
1498             linebuf[curb] = '\0';
1499             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1500         }
1501     }
1502 }
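
/*
 * Illustrative output format: each printed line looks like
 *
 *     0x00000080 : ...1......1111..................
 *
 * where the hex value is the page index at the start of the 128-page line,
 * '1' marks set bits, '.' marks clear bits, and lines consisting entirely
 * of the 'expected' value are suppressed.
 */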
1503 
1504 /* **** functions for postcopy ***** */
1505 
1506 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1507 {
1508     struct RAMBlock *block;
1509 
1510     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1511         unsigned long *bitmap = block->bmap;
1512         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1513         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1514 
1515         while (run_start < range) {
1516             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1517             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1518                               (run_end - run_start) << TARGET_PAGE_BITS);
1519             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1520         }
1521     }
1522 }
1523 
1524 /**
1525  * postcopy_send_discard_bm_ram: discard a RAMBlock
1526  *
1527  * Returns zero on success
1528  *
1529  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1530  * Note: At this point the 'unsentmap' is the processed bitmap combined
1531  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1532  *
1533  * @ms: current migration state
1534  * @pds: state for postcopy
1535  * @block: RAMBlock to discard
1537  */
1538 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1539                                         PostcopyDiscardState *pds,
1540                                         RAMBlock *block)
1541 {
1542     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1543     unsigned long current;
1544     unsigned long *unsentmap = block->unsentmap;
1545 
1546     for (current = 0; current < end; ) {
1547         unsigned long one = find_next_bit(unsentmap, end, current);
1548 
1549         if (one <= end) {
1550             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1551             unsigned long discard_length;
1552 
1553             if (zero >= end) {
1554                 discard_length = end - one;
1555             } else {
1556                 discard_length = zero - one;
1557             }
1558             if (discard_length) {
1559                 postcopy_discard_send_range(ms, pds, one, discard_length);
1560             }
1561             current = one + discard_length;
1562         } else {
1563             current = one;
1564         }
1565     }
1566 
1567     return 0;
1568 }
1569 
1570 /**
1571  * postcopy_each_ram_send_discard: discard all RAMBlocks
1572  *
1573  * Returns 0 for success or negative for error
1574  *
1575  * Utility for the outgoing postcopy code.
1576  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1577  *   passing it bitmap indexes and name.
1578  * (qemu_ram_foreach_block ends up passing unscaled lengths
1579  *  which would mean postcopy code would have to deal with target page)
1580  *
1581  * @ms: current migration state
1582  */
1583 static int postcopy_each_ram_send_discard(MigrationState *ms)
1584 {
1585     struct RAMBlock *block;
1586     int ret;
1587 
1588     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1589         PostcopyDiscardState *pds =
1590             postcopy_discard_send_init(ms, block->idstr);
1591 
1592         /*
1593          * Postcopy sends chunks of bitmap over the wire, but it
1594          * just needs indexes at this point, avoids it having
1595          * target page specific code.
1596          */
1597         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1598         postcopy_discard_send_finish(ms, pds);
1599         if (ret) {
1600             return ret;
1601         }
1602     }
1603 
1604     return 0;
1605 }
1606 
1607 /**
1608  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1609  *
1610  * Helper for postcopy_chunk_hostpages; it's called twice to
1611  * canonicalize the two bitmaps, which are similar but one is
1612  * inverted.
1613  *
1614  * Postcopy requires that all target pages in a hostpage are dirty or
1615  * clean, not a mix.  This function canonicalizes the bitmaps.
1616  *
1617  * @ms: current migration state
1618  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1619  *               otherwise we need to canonicalize partially dirty host pages
1620  * @block: block that contains the page we want to canonicalize
1621  * @pds: state for postcopy
1622  */
1623 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1624                                           RAMBlock *block,
1625                                           PostcopyDiscardState *pds)
1626 {
1627     RAMState *rs = &ram_state;
1628     unsigned long *bitmap = block->bmap;
1629     unsigned long *unsentmap = block->unsentmap;
1630     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1631     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1632     unsigned long run_start;
1633 
1634     if (block->page_size == TARGET_PAGE_SIZE) {
1635         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1636         return;
1637     }
1638 
1639     if (unsent_pass) {
1640         /* Find a sent page */
1641         run_start = find_next_zero_bit(unsentmap, pages, 0);
1642     } else {
1643         /* Find a dirty page */
1644         run_start = find_next_bit(bitmap, pages, 0);
1645     }
1646 
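    /*
     * Worked example: with 2 MiB huge pages and 4 KiB target pages,
     * host_ratio is 512.  A run starting at target page 1027 begins
     * 3 pages into its host page (1027 % 512 == 3), so the fixup below
     * rewinds to page 1024 and the whole host page [1024, 1536) is
     * discarded on the destination and re-marked dirty/unsent here.
     */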
1647     while (run_start < pages) {
1648         bool do_fixup = false;
1649         unsigned long fixup_start_addr;
1650         unsigned long host_offset;
1651 
1652         /*
1653          * If the start of this run of pages is in the middle of a host
1654          * page, then we need to fixup this host page.
1655          */
1656         host_offset = run_start % host_ratio;
1657         if (host_offset) {
1658             do_fixup = true;
1659             run_start -= host_offset;
1660             fixup_start_addr = run_start;
1661             /* For the next pass */
1662             run_start = run_start + host_ratio;
1663         } else {
1664             /* Find the end of this run */
1665             unsigned long run_end;
1666             if (unsent_pass) {
1667                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1668             } else {
1669                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1670             }
1671             /*
1672              * If the end isn't at the start of a host page, then the
1673              * run doesn't finish at the end of a host page
1674              * and we need to discard.
1675              */
1676             host_offset = run_end % host_ratio;
1677             if (host_offset) {
1678                 do_fixup = true;
1679                 fixup_start_addr = run_end - host_offset;
1680                 /*
1681                  * This host page has gone, the next loop iteration starts
1682                  * from after the fixup
1683                  */
1684                 run_start = fixup_start_addr + host_ratio;
1685             } else {
1686                 /*
1687                  * No discards on this iteration, next loop starts from
1688                  * next sent/dirty page
1689                  */
1690                 run_start = run_end + 1;
1691             }
1692         }
1693 
1694         if (do_fixup) {
1695             unsigned long page;
1696 
1697             /* Tell the destination to discard this page */
1698             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1699                 /* For the unsent_pass we:
1700                  *     discard partially sent pages
1701                  * For the !unsent_pass (dirty) we:
1702                  *     discard partially dirty pages that were sent
1703                  *     (any partially sent pages were already discarded
1704                  *     by the previous unsent_pass)
1705                  */
1706                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1707                                             host_ratio);
1708             }
1709 
1710             /* Clean up the bitmap */
1711             for (page = fixup_start_addr;
1712                  page < fixup_start_addr + host_ratio; page++) {
1713                 /* All pages in this host page are now not sent */
1714                 set_bit(page, unsentmap);
1715 
1716                 /*
1717                  * Remark them as dirty, updating the count for any pages
1718                  * that weren't previously dirty.
1719                  */
1720                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1721             }
1722         }
1723 
1724         if (unsent_pass) {
1725             /* Find the next sent page for the next iteration */
1726             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1727         } else {
1728             /* Find the next dirty page for the next iteration */
1729             run_start = find_next_bit(bitmap, pages, run_start);
1730         }
1731     }
1732 }
1733 
1734 /**
1735  * postcopy_chunk_hostpages: discard any partially sent host page
1736  *
1737  * Utility for the outgoing postcopy code.
1738  *
1739  * Discard any partially sent host-page size chunks, mark any partially
1740  * dirty host-page size chunks as all dirty.  In this case the host-page
1741  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1742  *
1743  * Returns zero on success
1744  *
1745  * @ms: current migration state
1746  * @block: block we want to work with
1747  */
1748 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1749 {
1750     PostcopyDiscardState *pds =
1751         postcopy_discard_send_init(ms, block->idstr);
1752 
1753     /* First pass: Discard all partially sent host pages */
1754     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1755     /*
1756      * Second pass: Ensure that all partially dirty host pages are made
1757      * fully dirty.
1758      */
1759     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1760 
1761     postcopy_discard_send_finish(ms, pds);
1762     return 0;
1763 }
1764 
1765 /**
1766  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1767  *
1768  * Returns zero on success
1769  *
1770  * Transmit the set of pages to be discarded after precopy to the target.
1771  * These are pages that:
1772  *     a) Have been previously transmitted but are now dirty again
1773  *     b) Have never been transmitted; this ensures that any pages on the
1774  *        destination that have been mapped by background tasks get
1775  *        discarded (transparent huge pages are the specific concern)
1776  * Hopefully this is pretty sparse
1777  *
1778  * @ms: current migration state
1779  */
1780 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1781 {
1782     RAMState *rs = &ram_state;
1783     RAMBlock *block;
1784     int ret;
1785 
1786     rcu_read_lock();
1787 
1788     /* This should be our last sync, the src is now paused */
1789     migration_bitmap_sync(rs);
1790 
1791     /* Easiest way to make sure we don't resume in the middle of a host-page */
1792     rs->last_seen_block = NULL;
1793     rs->last_sent_block = NULL;
1794     rs->last_page = 0;
1795 
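    /*
     * For each block: first canonicalize partially sent/dirty host pages
     * (postcopy_chunk_hostpages), then fold the dirty bitmap into the
     * unsentmap so the discard set becomes "never sent OR dirtied again";
     * postcopy_each_ram_send_discard() transmits it afterwards.
     */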
1796     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1797         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1798         unsigned long *bitmap = block->bmap;
1799         unsigned long *unsentmap = block->unsentmap;
1800 
1801         if (!unsentmap) {
1802             /* We don't have a safe way to resize the unsentmap, so
1803              * if the bitmap was resized it will be NULL at this
1804              * point.
1805              */
1806             error_report("migration ram resized during precopy phase");
1807             rcu_read_unlock();
1808             return -EINVAL;
1809         }
1810         /* Deal with TPS != HPS and huge pages */
1811         ret = postcopy_chunk_hostpages(ms, block);
1812         if (ret) {
1813             rcu_read_unlock();
1814             return ret;
1815         }
1816 
1817         /*
1818          * Update the unsentmap to be unsentmap = unsentmap | dirty
1819          */
1820         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1821 #ifdef DEBUG_POSTCOPY
1822         ram_debug_dump_bitmap(unsentmap, true, pages);
1823 #endif
1824     }
1825     trace_ram_postcopy_send_discard_bitmap();
1826 
1827     ret = postcopy_each_ram_send_discard(ms);
1828     rcu_read_unlock();
1829 
1830     return ret;
1831 }
1832 
1833 /**
1834  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1835  *
1836  * Returns zero on success
1837  *
1838  * @rbname: name of the RAMBlock of the request. NULL means the
1839  *          same as the last one.
1840  * @start: start offset (bytes) within the RAMBlock
1841  * @length: length (bytes) of the range to discard
1842  */
1843 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1844 {
1845     int ret = -1;
1846 
1847     trace_ram_discard_range(rbname, start, length);
1848 
1849     rcu_read_lock();
1850     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1851 
1852     if (!rb) {
1853         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1854         goto err;
1855     }
1856 
1857     ret = ram_block_discard_range(rb, start, length);
1858 
1859 err:
1860     rcu_read_unlock();
1861 
1862     return ret;
1863 }
1864 
1865 static int ram_state_init(RAMState *rs)
1866 {
1867     memset(rs, 0, sizeof(*rs));
1868     qemu_mutex_init(&rs->bitmap_mutex);
1869     qemu_mutex_init(&rs->src_page_req_mutex);
1870     QSIMPLEQ_INIT(&rs->src_page_requests);
1871 
1872     if (migrate_use_xbzrle()) {
1873         XBZRLE_cache_lock();
1874         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1875         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1876                                   TARGET_PAGE_SIZE,
1877                                   TARGET_PAGE_SIZE);
1878         if (!XBZRLE.cache) {
1879             XBZRLE_cache_unlock();
1880             error_report("Error creating cache");
1881             return -1;
1882         }
1883         XBZRLE_cache_unlock();
1884 
1885         /* We prefer not to abort if there is no memory */
1886         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1887         if (!XBZRLE.encoded_buf) {
1888             error_report("Error allocating encoded_buf");
1889             return -1;
1890         }
1891 
1892         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1893         if (!XBZRLE.current_buf) {
1894             error_report("Error allocating current_buf");
1895             g_free(XBZRLE.encoded_buf);
1896             XBZRLE.encoded_buf = NULL;
1897             return -1;
1898         }
1899     }
1900 
1901     /* For memory_global_dirty_log_start below.  */
1902     qemu_mutex_lock_iothread();
1903 
1904     qemu_mutex_lock_ramlist();
1905     rcu_read_lock();
1906     ram_state_reset(rs);
1907 
1908     /* Skip setting bitmap if there is no RAM */
1909     if (ram_bytes_total()) {
1910         RAMBlock *block;
1911 
1912         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1913             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1914 
1915             block->bmap = bitmap_new(pages);
1916             bitmap_set(block->bmap, 0, pages);
1917             if (migrate_postcopy_ram()) {
1918                 block->unsentmap = bitmap_new(pages);
1919                 bitmap_set(block->unsentmap, 0, pages);
1920             }
1921         }
1922     }
1923 
1924     /*
1925      * Count the total number of pages used by ram blocks not including any
1926      * gaps due to alignment or unplugs.
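     * For instance, a guest with 4 GiB of RAM and 4 KiB target pages
     * starts out with 1048576 dirty pages; every page is considered
     * dirty until it has been sent at least once.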
1927      */
1928     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1929 
1930     memory_global_dirty_log_start();
1931     migration_bitmap_sync(rs);
1932     qemu_mutex_unlock_ramlist();
1933     qemu_mutex_unlock_iothread();
1934     rcu_read_unlock();
1935 
1936     return 0;
1937 }
1938 
1939 /*
1940  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1941  * a long-running RCU critical section.  When RCU reclaims in the code
1942  * start to become numerous it will be necessary to reduce the
1943  * granularity of these critical sections.
1944  */
1945 
1946 /**
1947  * ram_save_setup: Setup RAM for migration
1948  *
1949  * Returns zero to indicate success and negative for error
1950  *
1951  * @f: QEMUFile where to send the data
1952  * @opaque: RAMState pointer
1953  */
1954 static int ram_save_setup(QEMUFile *f, void *opaque)
1955 {
1956     RAMState *rs = opaque;
1957     RAMBlock *block;
1958 
1959     /* migration has already set up the bitmap, reuse it. */
1960     if (!migration_in_colo_state()) {
1961         if (ram_state_init(rs) < 0) {
1962             return -1;
1963          }
1964     }
1965     rs->f = f;
1966 
1967     rcu_read_lock();
1968 
1969     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
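    /*
     * Setup-section layout as written below: a be64 of the total RAM size
     * or'd with RAM_SAVE_FLAG_MEM_SIZE, then for each RAMBlock a 1-byte
     * idstr length, the idstr bytes and a be64 used_length (plus a be64
     * page_size when postcopy is enabled and the block's page size differs
     * from the host page size), terminated by RAM_SAVE_FLAG_EOS.
     */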
1970 
1971     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1972         qemu_put_byte(f, strlen(block->idstr));
1973         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1974         qemu_put_be64(f, block->used_length);
1975         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1976             qemu_put_be64(f, block->page_size);
1977         }
1978     }
1979 
1980     rcu_read_unlock();
1981 
1982     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1983     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1984 
1985     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1986 
1987     return 0;
1988 }
1989 
1990 /**
1991  * ram_save_iterate: iterative stage for migration
1992  *
1993  * Returns zero to indicate success and negative for error
1994  *
1995  * @f: QEMUFile where to send the data
1996  * @opaque: RAMState pointer
1997  */
1998 static int ram_save_iterate(QEMUFile *f, void *opaque)
1999 {
2000     RAMState *rs = opaque;
2001     int ret;
2002     int i;
2003     int64_t t0;
2004     int done = 0;
2005 
2006     rcu_read_lock();
2007     if (ram_list.version != rs->last_version) {
2008         ram_state_reset(rs);
2009     }
2010 
2011     /* Read version before ram_list.blocks */
2012     smp_rmb();
2013 
2014     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2015 
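    /*
     * Send pages until the rate limiter tells us to stop, there is
     * nothing left to send for this round, or more than MAX_WAIT ms
     * have elapsed (checked once every 64 iterations).
     */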
2016     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2017     i = 0;
2018     while ((ret = qemu_file_rate_limit(f)) == 0) {
2019         int pages;
2020 
2021         pages = ram_find_and_save_block(rs, false);
2022         /* no more pages to send */
2023         if (pages == 0) {
2024             done = 1;
2025             break;
2026         }
2027         rs->iterations++;
2028 
2029         /* We want to check in the 1st loop, just in case it was the 1st time
2030            and we had to sync the dirty bitmap.
2031            qemu_clock_get_ns() is a bit expensive, so we only check once
2032            every few iterations.
2033         */
2034         if ((i & 63) == 0) {
2035             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2036             if (t1 > MAX_WAIT) {
2037                 trace_ram_save_iterate_big_wait(t1, i);
2038                 break;
2039             }
2040         }
2041         i++;
2042     }
2043     flush_compressed_data(rs);
2044     rcu_read_unlock();
2045 
2046     /*
2047      * Must occur before EOS (or any QEMUFile operation)
2048      * because of RDMA protocol.
2049      */
2050     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2051 
2052     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2053     rs->bytes_transferred += 8;
2054 
2055     ret = qemu_file_get_error(f);
2056     if (ret < 0) {
2057         return ret;
2058     }
2059 
2060     return done;
2061 }
2062 
2063 /**
2064  * ram_save_complete: function called to send the remaining amount of ram
2065  *
2066  * Returns zero to indicate success
2067  *
2068  * Called with the iothread lock held
2069  *
2070  * @f: QEMUFile where to send the data
2071  * @opaque: RAMState pointer
2072  */
2073 static int ram_save_complete(QEMUFile *f, void *opaque)
2074 {
2075     RAMState *rs = opaque;
2076 
2077     rcu_read_lock();
2078 
2079     if (!migration_in_postcopy()) {
2080         migration_bitmap_sync(rs);
2081     }
2082 
2083     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2084 
2085     /* try transferring iterative blocks of memory */
2086 
2087     /* flush all remaining blocks regardless of rate limiting */
2088     while (true) {
2089         int pages;
2090 
2091         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2092         /* no more blocks to send */
2093         if (pages == 0) {
2094             break;
2095         }
2096     }
2097 
2098     flush_compressed_data(rs);
2099     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2100 
2101     rcu_read_unlock();
2102 
2103     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2104 
2105     return 0;
2106 }
2107 
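/**
 * ram_save_pending: report how much data remains to be sent
 *
 * For RAM all the remaining data is postcopiable, so the estimate
 * (dirty pages times the target page size) is only added to
 * *postcopiable_pending.  During precopy, if the estimate has dropped
 * below max_size, the dirty bitmap is synced once more (under the
 * iothread lock) to refresh it.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 * @max_size: threshold below which the dirty bitmap is re-synced
 * @non_postcopiable_pending: pending bytes that must be sent before postcopy
 * @postcopiable_pending: pending bytes that can be sent during postcopy
 */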
2108 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2109                              uint64_t *non_postcopiable_pending,
2110                              uint64_t *postcopiable_pending)
2111 {
2112     RAMState *rs = opaque;
2113     uint64_t remaining_size;
2114 
2115     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2116 
2117     if (!migration_in_postcopy() &&
2118         remaining_size < max_size) {
2119         qemu_mutex_lock_iothread();
2120         rcu_read_lock();
2121         migration_bitmap_sync(rs);
2122         rcu_read_unlock();
2123         qemu_mutex_unlock_iothread();
2124         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2125     }
2126 
2127     /* We can do postcopy, and all the data is postcopiable */
2128     *postcopiable_pending += remaining_size;
2129 }
2130 
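/*
 * An XBZRLE page on the wire is a 1-byte encoding flag (which must be
 * ENCODING_FLAG_XBZRLE), a be16 length of the encoded data (at most
 * TARGET_PAGE_SIZE), followed by the encoded bytes; load_xbzrle() below
 * decodes them as a delta against the current contents of the page.
 */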
2131 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2132 {
2133     unsigned int xh_len;
2134     int xh_flags;
2135     uint8_t *loaded_data;
2136 
2137     if (!xbzrle_decoded_buf) {
2138         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2139     }
2140     loaded_data = xbzrle_decoded_buf;
2141 
2142     /* extract RLE header */
2143     xh_flags = qemu_get_byte(f);
2144     xh_len = qemu_get_be16(f);
2145 
2146     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2147         error_report("Failed to load XBZRLE page - wrong compression!");
2148         return -1;
2149     }
2150 
2151     if (xh_len > TARGET_PAGE_SIZE) {
2152         error_report("Failed to load XBZRLE page - len overflow!");
2153         return -1;
2154     }
2155     /* load data and decode */
2156     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2157 
2158     /* decode RLE */
2159     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2160                              TARGET_PAGE_SIZE) == -1) {
2161         error_report("Failed to load XBZRLE page - decode error!");
2162         return -1;
2163     }
2164 
2165     return 0;
2166 }
2167 
2168 /**
2169  * ram_block_from_stream: read a RAMBlock id from the migration stream
2170  *
2171  * Must be called from within a rcu critical section.
2172  *
2173  * Returns a pointer from within the RCU-protected ram_list.
2174  *
2175  * @f: QEMUFile where to read the data from
2176  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2177  */
2178 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2179 {
2180     static RAMBlock *block = NULL;
2181     char id[256];
2182     uint8_t len;
2183 
2184     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2185         if (!block) {
2186             error_report("Ack, bad migration stream!");
2187             return NULL;
2188         }
2189         return block;
2190     }
2191 
2192     len = qemu_get_byte(f);
2193     qemu_get_buffer(f, (uint8_t *)id, len);
2194     id[len] = 0;
2195 
2196     block = qemu_ram_block_by_name(id);
2197     if (!block) {
2198         error_report("Can't find block %s", id);
2199         return NULL;
2200     }
2201 
2202     return block;
2203 }
2204 
2205 static inline void *host_from_ram_block_offset(RAMBlock *block,
2206                                                ram_addr_t offset)
2207 {
2208     if (!offset_in_ramblock(block, offset)) {
2209         return NULL;
2210     }
2211 
2212     return block->host + offset;
2213 }
2214 
2215 /**
2216  * ram_handle_compressed: handle the zero page case
2217  *
2218  * If a page (or a whole RDMA chunk) has been
2219  * determined to be zero, then zap it.
2220  *
2221  * @host: host address for the zero page
2222  * @ch: what the page is filled with.  We only support zero
2223  * @size: size of the zero page
2224  */
2225 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2226 {
2227     if (ch != 0 || !is_zero_range(host, size)) {
2228         memset(host, ch, size);
2229     }
2230 }
2231 
2232 static void *do_data_decompress(void *opaque)
2233 {
2234     DecompressParam *param = opaque;
2235     unsigned long pagesize;
2236     uint8_t *des;
2237     int len;
2238 
2239     qemu_mutex_lock(&param->mutex);
2240     while (!param->quit) {
2241         if (param->des) {
2242             des = param->des;
2243             len = param->len;
2244             param->des = 0;
2245             qemu_mutex_unlock(&param->mutex);
2246 
2247             pagesize = TARGET_PAGE_SIZE;
2248             /* uncompress() can fail in some cases, especially when the
2249              * page was dirtied while it was being compressed.  That is
2250              * not a problem because the dirty page will be retransmitted
2251              * and uncompress() won't corrupt the data in other pages.
2252              */
2253             uncompress((Bytef *)des, &pagesize,
2254                        (const Bytef *)param->compbuf, len);
2255 
2256             qemu_mutex_lock(&decomp_done_lock);
2257             param->done = true;
2258             qemu_cond_signal(&decomp_done_cond);
2259             qemu_mutex_unlock(&decomp_done_lock);
2260 
2261             qemu_mutex_lock(&param->mutex);
2262         } else {
2263             qemu_cond_wait(&param->cond, &param->mutex);
2264         }
2265     }
2266     qemu_mutex_unlock(&param->mutex);
2267 
2268     return NULL;
2269 }
2270 
2271 static void wait_for_decompress_done(void)
2272 {
2273     int idx, thread_count;
2274 
2275     if (!migrate_use_compression()) {
2276         return;
2277     }
2278 
2279     thread_count = migrate_decompress_threads();
2280     qemu_mutex_lock(&decomp_done_lock);
2281     for (idx = 0; idx < thread_count; idx++) {
2282         while (!decomp_param[idx].done) {
2283             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2284         }
2285     }
2286     qemu_mutex_unlock(&decomp_done_lock);
2287 }
2288 
2289 void migrate_decompress_threads_create(void)
2290 {
2291     int i, thread_count;
2292 
2293     thread_count = migrate_decompress_threads();
2294     decompress_threads = g_new0(QemuThread, thread_count);
2295     decomp_param = g_new0(DecompressParam, thread_count);
2296     qemu_mutex_init(&decomp_done_lock);
2297     qemu_cond_init(&decomp_done_cond);
2298     for (i = 0; i < thread_count; i++) {
2299         qemu_mutex_init(&decomp_param[i].mutex);
2300         qemu_cond_init(&decomp_param[i].cond);
2301         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2302         decomp_param[i].done = true;
2303         decomp_param[i].quit = false;
2304         qemu_thread_create(decompress_threads + i, "decompress",
2305                            do_data_decompress, decomp_param + i,
2306                            QEMU_THREAD_JOINABLE);
2307     }
2308 }
2309 
2310 void migrate_decompress_threads_join(void)
2311 {
2312     int i, thread_count;
2313 
2314     thread_count = migrate_decompress_threads();
2315     for (i = 0; i < thread_count; i++) {
2316         qemu_mutex_lock(&decomp_param[i].mutex);
2317         decomp_param[i].quit = true;
2318         qemu_cond_signal(&decomp_param[i].cond);
2319         qemu_mutex_unlock(&decomp_param[i].mutex);
2320     }
2321     for (i = 0; i < thread_count; i++) {
2322         qemu_thread_join(decompress_threads + i);
2323         qemu_mutex_destroy(&decomp_param[i].mutex);
2324         qemu_cond_destroy(&decomp_param[i].cond);
2325         g_free(decomp_param[i].compbuf);
2326     }
2327     g_free(decompress_threads);
2328     g_free(decomp_param);
2329     decompress_threads = NULL;
2330     decomp_param = NULL;
2331 }
2332 
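/*
 * Hand a compressed page to an idle decompression thread: find a worker
 * whose 'done' flag is set, read the compressed bytes into its compbuf,
 * record the destination and length under the worker's mutex and signal
 * its condition variable; if no worker is idle, wait on decomp_done_cond.
 */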
2333 static void decompress_data_with_multi_threads(QEMUFile *f,
2334                                                void *host, int len)
2335 {
2336     int idx, thread_count;
2337 
2338     thread_count = migrate_decompress_threads();
2339     qemu_mutex_lock(&decomp_done_lock);
2340     while (true) {
2341         for (idx = 0; idx < thread_count; idx++) {
2342             if (decomp_param[idx].done) {
2343                 decomp_param[idx].done = false;
2344                 qemu_mutex_lock(&decomp_param[idx].mutex);
2345                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2346                 decomp_param[idx].des = host;
2347                 decomp_param[idx].len = len;
2348                 qemu_cond_signal(&decomp_param[idx].cond);
2349                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2350                 break;
2351             }
2352         }
2353         if (idx < thread_count) {
2354             break;
2355         } else {
2356             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2357         }
2358     }
2359     qemu_mutex_unlock(&decomp_done_lock);
2360 }
2361 
2362 /**
2363  * ram_postcopy_incoming_init: allocate postcopy data structures
2364  *
2365  * Returns 0 for success and negative if there was an error
2366  *
2367  * @mis: current migration incoming state
2368  *
2369  * Allocate data structures etc. needed by incoming migration with
2370  * postcopy-ram.  postcopy-ram's similarly named
2371  * postcopy_ram_incoming_init does the work.
2372  */
2373 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2374 {
2375     unsigned long ram_pages = last_ram_page();
2376 
2377     return postcopy_ram_incoming_init(mis, ram_pages);
2378 }
2379 
2380 /**
2381  * ram_load_postcopy: load a page in postcopy case
2382  *
2383  * Returns 0 for success or -errno in case of error
2384  *
2385  * Called in postcopy mode by ram_load().
2386  * rcu_read_lock is taken prior to this being called.
2387  *
2388  * @f: QEMUFile where to read the data from
2389  */
2390 static int ram_load_postcopy(QEMUFile *f)
2391 {
2392     int flags = 0, ret = 0;
2393     bool place_needed = false;
2394     bool matching_page_sizes = false;
2395     MigrationIncomingState *mis = migration_incoming_get_current();
2396     /* Temporary page that is later 'placed' */
2397     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2398     void *last_host = NULL;
2399     bool all_zero = false;
2400 
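    /*
     * Example: for a hugetlbfs-backed block with 2 MiB pages and 4 KiB
     * target pages, the 512 target pages making up each host page are
     * gathered into postcopy_host_page below and only placed into guest
     * memory, atomically, once the last target page of that host page
     * has been read from the stream.
     */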
2401     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2402         ram_addr_t addr;
2403         void *host = NULL;
2404         void *page_buffer = NULL;
2405         void *place_source = NULL;
2406         RAMBlock *block = NULL;
2407         uint8_t ch;
2408 
2409         addr = qemu_get_be64(f);
2410         flags = addr & ~TARGET_PAGE_MASK;
2411         addr &= TARGET_PAGE_MASK;
2412 
2413         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2414         place_needed = false;
2415         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2416             block = ram_block_from_stream(f, flags);
2417 
2418             host = host_from_ram_block_offset(block, addr);
2419             if (!host) {
2420                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2421                 ret = -EINVAL;
2422                 break;
2423             }
2424             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2425             /*
2426              * Postcopy requires that we place whole host pages atomically;
2427              * these may be huge pages for RAMBlocks that are backed by
2428              * hugetlbfs.
2429              * To make it atomic, the data is read into a temporary page
2430              * that's moved into place later.
2431              * The migration protocol uses, possibly smaller, target pages;
2432              * however, the source ensures it always sends all the components
2433              * of a host page in order.
2434              */
2435             page_buffer = postcopy_host_page +
2436                           ((uintptr_t)host & (block->page_size - 1));
2437             /* 1st TP within the HP: if all TPs turn out zero we can optimise the place */
2438             if (!((uintptr_t)host & (block->page_size - 1))) {
2439                 all_zero = true;
2440             } else {
2441                 /* not the 1st TP within the HP */
2442                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2443                     error_report("Non-sequential target page %p/%p",
2444                                   host, last_host);
2445                     ret = -EINVAL;
2446                     break;
2447                 }
2448             }
2449 
2450 
2451             /*
2452              * If it's the last part of a host page then we place the host
2453              * page
2454              */
2455             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2456                                      (block->page_size - 1)) == 0;
2457             place_source = postcopy_host_page;
2458         }
2459         last_host = host;
2460 
2461         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2462         case RAM_SAVE_FLAG_ZERO:
2463             ch = qemu_get_byte(f);
2464             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2465             if (ch) {
2466                 all_zero = false;
2467             }
2468             break;
2469 
2470         case RAM_SAVE_FLAG_PAGE:
2471             all_zero = false;
2472             if (!place_needed || !matching_page_sizes) {
2473                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2474             } else {
2475                 /* Avoid the extra copy out of the qemu_file buffer;
2476                  * postcopy will copy the page into place later anyway.
2477                  * We can only do this when reading in one go (matching page sizes)
2478                  */
2479                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2480                                          TARGET_PAGE_SIZE);
2481             }
2482             break;
2483         case RAM_SAVE_FLAG_EOS:
2484             /* normal exit */
2485             break;
2486         default:
2487             error_report("Unknown combination of migration flags: %#x"
2488                          " (postcopy mode)", flags);
2489             ret = -EINVAL;
2490         }
2491 
2492         if (place_needed) {
2493             /* This gets called at the last target page in the host page */
2494             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2495 
2496             if (all_zero) {
2497                 ret = postcopy_place_page_zero(mis, place_dest,
2498                                                block->page_size);
2499             } else {
2500                 ret = postcopy_place_page(mis, place_dest,
2501                                           place_source, block->page_size);
2502             }
2503         }
2504         if (!ret) {
2505             ret = qemu_file_get_error(f);
2506         }
2507     }
2508 
2509     return ret;
2510 }
2511 
2512 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2513 {
2514     int flags = 0, ret = 0;
2515     static uint64_t seq_iter;
2516     int len = 0;
2517     /*
2518      * If the system is running in postcopy mode, page inserts to host memory must
2519      * be atomic
2520      */
2521     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2522     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2523     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2524 
2525     seq_iter++;
2526 
2527     if (version_id != 4) {
2528         ret = -EINVAL;
2529     }
2530 
2531     /* This RCU critical section can be very long running.
2532      * When RCU reclaims in the code start to become numerous,
2533      * it will be necessary to reduce the granularity of this
2534      * critical section.
2535      */
2536     rcu_read_lock();
2537 
2538     if (postcopy_running) {
2539         ret = ram_load_postcopy(f);
2540     }
2541 
2542     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2543         ram_addr_t addr, total_ram_bytes;
2544         void *host = NULL;
2545         uint8_t ch;
2546 
2547         addr = qemu_get_be64(f);
2548         flags = addr & ~TARGET_PAGE_MASK;
2549         addr &= TARGET_PAGE_MASK;
2550 
2551         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2552                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2553             RAMBlock *block = ram_block_from_stream(f, flags);
2554 
2555             host = host_from_ram_block_offset(block, addr);
2556             if (!host) {
2557                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2558                 ret = -EINVAL;
2559                 break;
2560             }
2561             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2562         }
2563 
2564         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2565         case RAM_SAVE_FLAG_MEM_SIZE:
2566             /* Synchronize RAM block list */
2567             total_ram_bytes = addr;
2568             while (!ret && total_ram_bytes) {
2569                 RAMBlock *block;
2570                 char id[256];
2571                 ram_addr_t length;
2572 
2573                 len = qemu_get_byte(f);
2574                 qemu_get_buffer(f, (uint8_t *)id, len);
2575                 id[len] = 0;
2576                 length = qemu_get_be64(f);
2577 
2578                 block = qemu_ram_block_by_name(id);
2579                 if (block) {
2580                     if (length != block->used_length) {
2581                         Error *local_err = NULL;
2582 
2583                         ret = qemu_ram_resize(block, length,
2584                                               &local_err);
2585                         if (local_err) {
2586                             error_report_err(local_err);
2587                         }
2588                     }
2589                     /* For postcopy we need to check hugepage sizes match */
2590                     if (postcopy_advised &&
2591                         block->page_size != qemu_host_page_size) {
2592                         uint64_t remote_page_size = qemu_get_be64(f);
2593                         if (remote_page_size != block->page_size) {
2594                             error_report("Mismatched RAM page size %s "
2595                                          "(local) %zd != %" PRId64,
2596                                          id, block->page_size,
2597                                          remote_page_size);
2598                             ret = -EINVAL;
2599                         }
2600                     }
2601                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2602                                           block->idstr);
2603                 } else {
2604                     error_report("Unknown ramblock \"%s\", cannot "
2605                                  "accept migration", id);
2606                     ret = -EINVAL;
2607                 }
2608 
2609                 total_ram_bytes -= length;
2610             }
2611             break;
2612 
2613         case RAM_SAVE_FLAG_ZERO:
2614             ch = qemu_get_byte(f);
2615             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2616             break;
2617 
2618         case RAM_SAVE_FLAG_PAGE:
2619             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2620             break;
2621 
2622         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2623             len = qemu_get_be32(f);
2624             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2625                 error_report("Invalid compressed data length: %d", len);
2626                 ret = -EINVAL;
2627                 break;
2628             }
2629             decompress_data_with_multi_threads(f, host, len);
2630             break;
2631 
2632         case RAM_SAVE_FLAG_XBZRLE:
2633             if (load_xbzrle(f, addr, host) < 0) {
2634                 error_report("Failed to decompress XBZRLE page at "
2635                              RAM_ADDR_FMT, addr);
2636                 ret = -EINVAL;
2637                 break;
2638             }
2639             break;
2640         case RAM_SAVE_FLAG_EOS:
2641             /* normal exit */
2642             break;
2643         default:
2644             if (flags & RAM_SAVE_FLAG_HOOK) {
2645                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2646             } else {
2647                 error_report("Unknown combination of migration flags: %#x",
2648                              flags);
2649                 ret = -EINVAL;
2650             }
2651         }
2652         if (!ret) {
2653             ret = qemu_file_get_error(f);
2654         }
2655     }
2656 
2657     wait_for_decompress_done();
2658     rcu_read_unlock();
2659     trace_ram_load_complete(ret, seq_iter);
2660     return ret;
2661 }
2662 
2663 static SaveVMHandlers savevm_ram_handlers = {
2664     .save_live_setup = ram_save_setup,
2665     .save_live_iterate = ram_save_iterate,
2666     .save_live_complete_postcopy = ram_save_complete,
2667     .save_live_complete_precopy = ram_save_complete,
2668     .save_live_pending = ram_save_pending,
2669     .load_state = ram_load,
2670     .cleanup = ram_migration_cleanup,
2671 };
2672 
2673 void ram_mig_init(void)
2674 {
2675     qemu_mutex_init(&XBZRLE.lock);
2676     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2677 }
2678