xref: /openbmc/qemu/migration/ram.c (revision 12a6c15e)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "exec/ram_addr.h"
47 #include "qemu/rcu_queue.h"
48 #include "migration/colo.h"
49 
50 /***********************************************************/
51 /* ram save/restore */
52 
53 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
54  * worked for pages that were filled with the same char.  We switched
55  * it to only search for the zero value, and renamed it to avoid
56  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
57  */
58 
59 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
60 #define RAM_SAVE_FLAG_ZERO     0x02
61 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
62 #define RAM_SAVE_FLAG_PAGE     0x08
63 #define RAM_SAVE_FLAG_EOS      0x10
64 #define RAM_SAVE_FLAG_CONTINUE 0x20
65 #define RAM_SAVE_FLAG_XBZRLE   0x40
66 /* 0x80 is reserved in migration.h; start with 0x100 next */
67 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
68 
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
70 {
71     return buffer_is_zero(p, size);
72 }
73 
74 XBZRLECacheStats xbzrle_counters;
75 
76 /* struct contains XBZRLE cache and a static page
77    used by the compression */
78 static struct {
79     /* buffer used for XBZRLE encoding */
80     uint8_t *encoded_buf;
81     /* buffer for storing page content */
82     uint8_t *current_buf;
83     /* Cache for XBZRLE, protected by lock. */
84     PageCache *cache;
85     QemuMutex lock;
86     /* it will store a page full of zeros */
87     uint8_t *zero_target_page;
88     /* buffer used for XBZRLE decoding */
89     uint8_t *decoded_buf;
90 } XBZRLE;
91 
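/* Take/release the XBZRLE cache lock; both are no-ops when XBZRLE is
 * not in use for this migration.
 */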
92 static void XBZRLE_cache_lock(void)
93 {
94     if (migrate_use_xbzrle())
95         qemu_mutex_lock(&XBZRLE.lock);
96 }
97 
98 static void XBZRLE_cache_unlock(void)
99 {
100     if (migrate_use_xbzrle())
101         qemu_mutex_unlock(&XBZRLE.lock);
102 }
103 
104 /**
105  * xbzrle_cache_resize: resize the xbzrle cache
106  *
107  * This function is called from qmp_migrate_set_cache_size in the main
108  * thread, possibly while a migration is in progress.  A running
109  * migration may be using the cache and might finish during this call,
110  * hence changes to the cache are protected by the XBZRLE.lock mutex.
111  *
112  * Returns the new cache size (a power of two) or negative on error.
113  *
114  * @new_size: new cache size
115  */
116 int64_t xbzrle_cache_resize(int64_t new_size)
117 {
118     PageCache *new_cache;
119     int64_t ret;
120 
121     if (new_size < TARGET_PAGE_SIZE) {
122         return -1;
123     }
124 
125     XBZRLE_cache_lock();
126 
127     if (XBZRLE.cache != NULL) {
128         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
129             goto out_new_size;
130         }
131         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
132                                         TARGET_PAGE_SIZE);
133         if (!new_cache) {
134             error_report("Error creating cache");
135             ret = -1;
136             goto out;
137         }
138 
139         cache_fini(XBZRLE.cache);
140         XBZRLE.cache = new_cache;
141     }
142 
143 out_new_size:
144     ret = pow2floor(new_size);
145 out:
146     XBZRLE_cache_unlock();
147     return ret;
148 }
149 
150 /*
151  * An outstanding page request, on the source, having been received
152  * and queued
153  */
154 struct RAMSrcPageRequest {
155     RAMBlock *rb;
156     hwaddr    offset;
157     hwaddr    len;
158 
159     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
160 };
161 
162 /* State of RAM for migration */
163 struct RAMState {
164     /* QEMUFile used for this migration */
165     QEMUFile *f;
166     /* Last block that we have visited searching for dirty pages */
167     RAMBlock *last_seen_block;
168     /* Last block from where we have sent data */
169     RAMBlock *last_sent_block;
170     /* Last dirty target page we have sent */
171     ram_addr_t last_page;
172     /* last ram version we have seen */
173     uint32_t last_version;
174     /* We are in the first round */
175     bool ram_bulk_stage;
176     /* How many times we have dirty too many pages */
177     int dirty_rate_high_cnt;
178     /* these variables are used for bitmap sync */
179     /* last time we did a full bitmap_sync */
180     int64_t time_last_bitmap_sync;
181     /* bytes transferred at start_time */
182     uint64_t bytes_xfer_prev;
183     /* number of dirty pages since start_time */
184     uint64_t num_dirty_pages_period;
185     /* xbzrle misses since the beginning of the period */
186     uint64_t xbzrle_cache_miss_prev;
187     /* number of iterations at the beginning of period */
188     uint64_t iterations_prev;
189     /* Iterations since start */
190     uint64_t iterations;
191     /* number of dirty bits in the bitmap */
192     uint64_t migration_dirty_pages;
193     /* protects modification of the bitmap */
194     QemuMutex bitmap_mutex;
195     /* The RAMBlock used in the last src_page_requests */
196     RAMBlock *last_req_rb;
197     /* Queue of outstanding page requests from the destination */
198     QemuMutex src_page_req_mutex;
199     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
200 };
201 typedef struct RAMState RAMState;
202 
203 static RAMState *ram_state;
204 
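/**
 * ram_bytes_remaining: bytes of RAM still to be transferred
 *
 * Returns the number of bytes corresponding to the pages that are
 * still marked dirty in the migration bitmap.
 */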
205 uint64_t ram_bytes_remaining(void)
206 {
207     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
208 }
209 
210 MigrationStats ram_counters;
211 
212 /* used by the search for pages to send */
213 struct PageSearchStatus {
214     /* Current block being searched */
215     RAMBlock    *block;
216     /* Current page to search from */
217     unsigned long page;
218     /* Set once we wrap around */
219     bool         complete_round;
220 };
221 typedef struct PageSearchStatus PageSearchStatus;
222 
223 struct CompressParam {
224     bool done;
225     bool quit;
226     QEMUFile *file;
227     QemuMutex mutex;
228     QemuCond cond;
229     RAMBlock *block;
230     ram_addr_t offset;
231 };
232 typedef struct CompressParam CompressParam;
233 
234 struct DecompressParam {
235     bool done;
236     bool quit;
237     QemuMutex mutex;
238     QemuCond cond;
239     void *des;
240     uint8_t *compbuf;
241     int len;
242 };
243 typedef struct DecompressParam DecompressParam;
244 
245 static CompressParam *comp_param;
246 static QemuThread *compress_threads;
247 /* comp_done_cond is used to wake up the migration thread when
248  * one of the compression threads has finished the compression.
249  * comp_done_lock is used together with comp_done_cond.
250  */
251 static QemuMutex comp_done_lock;
252 static QemuCond comp_done_cond;
253 /* The empty QEMUFileOps will be used by file in CompressParam */
254 static const QEMUFileOps empty_ops = { };
255 
256 static DecompressParam *decomp_param;
257 static QemuThread *decompress_threads;
258 static QemuMutex decomp_done_lock;
259 static QemuCond decomp_done_cond;
260 
261 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
262                                 ram_addr_t offset);
263 
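/**
 * do_data_compress: body of a compression worker thread
 *
 * Waits for a (block, offset) pair to be handed over in its
 * CompressParam, compresses that page into the per-thread buffer file
 * and signals comp_done_cond so the migration thread can collect the
 * result.  Exits when the 'quit' flag is set.
 *
 * @opaque: pointer to this thread's CompressParam
 */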
264 static void *do_data_compress(void *opaque)
265 {
266     CompressParam *param = opaque;
267     RAMBlock *block;
268     ram_addr_t offset;
269 
270     qemu_mutex_lock(&param->mutex);
271     while (!param->quit) {
272         if (param->block) {
273             block = param->block;
274             offset = param->offset;
275             param->block = NULL;
276             qemu_mutex_unlock(&param->mutex);
277 
278             do_compress_ram_page(param->file, block, offset);
279 
280             qemu_mutex_lock(&comp_done_lock);
281             param->done = true;
282             qemu_cond_signal(&comp_done_cond);
283             qemu_mutex_unlock(&comp_done_lock);
284 
285             qemu_mutex_lock(&param->mutex);
286         } else {
287             qemu_cond_wait(&param->cond, &param->mutex);
288         }
289     }
290     qemu_mutex_unlock(&param->mutex);
291 
292     return NULL;
293 }
294 
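/* Ask every compression thread to quit by setting its 'quit' flag and
 * waking it up; each thread exits its loop on the next iteration.
 */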
295 static inline void terminate_compression_threads(void)
296 {
297     int idx, thread_count;
298 
299     thread_count = migrate_compress_threads();
300 
301     for (idx = 0; idx < thread_count; idx++) {
302         qemu_mutex_lock(&comp_param[idx].mutex);
303         comp_param[idx].quit = true;
304         qemu_cond_signal(&comp_param[idx].cond);
305         qemu_mutex_unlock(&comp_param[idx].mutex);
306     }
307 }
308 
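/* Tear down the compression threads: ask them to quit, join them and
 * free all the state created by compress_threads_save_setup().
 */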
309 static void compress_threads_save_cleanup(void)
310 {
311     int i, thread_count;
312 
313     if (!migrate_use_compression()) {
314         return;
315     }
316     terminate_compression_threads();
317     thread_count = migrate_compress_threads();
318     for (i = 0; i < thread_count; i++) {
319         qemu_thread_join(compress_threads + i);
320         qemu_fclose(comp_param[i].file);
321         qemu_mutex_destroy(&comp_param[i].mutex);
322         qemu_cond_destroy(&comp_param[i].cond);
323     }
324     qemu_mutex_destroy(&comp_done_lock);
325     qemu_cond_destroy(&comp_done_cond);
326     g_free(compress_threads);
327     g_free(comp_param);
328     compress_threads = NULL;
329     comp_param = NULL;
330 }
331 
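/* Create the compression worker threads and the per-thread state they
 * use (buffer file, mutex, condition variable).  Does nothing if
 * compression is not enabled for this migration.
 */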
332 static void compress_threads_save_setup(void)
333 {
334     int i, thread_count;
335 
336     if (!migrate_use_compression()) {
337         return;
338     }
339     thread_count = migrate_compress_threads();
340     compress_threads = g_new0(QemuThread, thread_count);
341     comp_param = g_new0(CompressParam, thread_count);
342     qemu_cond_init(&comp_done_cond);
343     qemu_mutex_init(&comp_done_lock);
344     for (i = 0; i < thread_count; i++) {
345         /* comp_param[i].file is just used as a dummy buffer to save data,
346          * set its ops to empty.
347          */
348         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
349         comp_param[i].done = true;
350         comp_param[i].quit = false;
351         qemu_mutex_init(&comp_param[i].mutex);
352         qemu_cond_init(&comp_param[i].cond);
353         qemu_thread_create(compress_threads + i, "compress",
354                            do_data_compress, comp_param + i,
355                            QEMU_THREAD_JOINABLE);
356     }
357 }
358 
359 /**
360  * save_page_header: write page header to wire
361  *
362  * If this is not a continuation of the last block sent, it also writes the block identification
363  *
364  * Returns the number of bytes written
365  *
366  * @f: QEMUFile where to send the data
367  * @block: block that contains the page we want to send
368  * @offset: offset inside the block for the page
369  *          in the lower bits, it contains flags
370  */
371 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
372                                ram_addr_t offset)
373 {
374     size_t size, len;
375 
376     if (block == rs->last_sent_block) {
377         offset |= RAM_SAVE_FLAG_CONTINUE;
378     }
379     qemu_put_be64(f, offset);
380     size = 8;
381 
382     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
383         len = strlen(block->idstr);
384         qemu_put_byte(f, len);
385         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
386         size += 1 + len;
387         rs->last_sent_block = block;
388     }
389     return size;
390 }
391 
392 /**
393  * mig_throttle_guest_down: throttle down the guest
394  *
395  * Reduce the amount of guest CPU execution to hopefully slow down memory
396  * writes. If the guest's dirty memory rate is reduced below the rate at
397  * which we can transfer pages to the destination then we should be
398  * able to complete migration. Some workloads dirty memory way too
399  * fast and will not effectively converge, even with auto-converge.
400  */
401 static void mig_throttle_guest_down(void)
402 {
403     MigrationState *s = migrate_get_current();
404     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
405     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
406 
407     /* We have not started throttling yet. Let's start it. */
408     if (!cpu_throttle_active()) {
409         cpu_throttle_set(pct_initial);
410     } else {
411         /* Throttling already on, just increase the rate */
412         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
413     }
414 }
415 
416 /**
417  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
418  *
419  * @rs: current RAM state
420  * @current_addr: address for the zero page
421  *
422  * Update the xbzrle cache to reflect a page that's been sent as all 0.
423  * The important thing is that a stale (not-yet-0'd) page be replaced
424  * by the new data.
425  * As a bonus, if the page wasn't in the cache it gets added so that
426  * when a small write is made into the 0'd page it gets XBZRLE sent.
427  */
428 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
429 {
430     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
431         return;
432     }
433 
434     /* We don't care if this fails to allocate a new cache page
435      * as long as it updated an old one */
436     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
437                  ram_counters.dirty_sync_count);
438 }
439 
440 #define ENCODING_FLAG_XBZRLE 0x1
441 
442 /**
443  * save_xbzrle_page: compress and send current page
444  *
445  * Returns: 1 means that we wrote the page
446  *          0 means that page is identical to the one already sent
447  *          -1 means that xbzrle would be longer than normal
448  *
449  * @rs: current RAM state
450  * @current_data: pointer to the address of the page contents
451  * @current_addr: addr of the page
452  * @block: block that contains the page we want to send
453  * @offset: offset inside the block for the page
454  * @last_stage: if we are at the completion stage
455  */
456 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
457                             ram_addr_t current_addr, RAMBlock *block,
458                             ram_addr_t offset, bool last_stage)
459 {
460     int encoded_len = 0, bytes_xbzrle;
461     uint8_t *prev_cached_page;
462 
463     if (!cache_is_cached(XBZRLE.cache, current_addr,
464                          ram_counters.dirty_sync_count)) {
465         xbzrle_counters.cache_miss++;
466         if (!last_stage) {
467             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
468                              ram_counters.dirty_sync_count) == -1) {
469                 return -1;
470             } else {
471                 /* update *current_data when the page has been
472                    inserted into cache */
473                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
474             }
475         }
476         return -1;
477     }
478 
479     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
480 
481     /* save current buffer into memory */
482     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
483 
484     /* XBZRLE encoding (if there is no overflow) */
485     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
486                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
487                                        TARGET_PAGE_SIZE);
488     if (encoded_len == 0) {
489         trace_save_xbzrle_page_skipping();
490         return 0;
491     } else if (encoded_len == -1) {
492         trace_save_xbzrle_page_overflow();
493         xbzrle_counters.overflow++;
494         /* update data in the cache */
495         if (!last_stage) {
496             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
497             *current_data = prev_cached_page;
498         }
499         return -1;
500     }
501 
502     /* we need to update the data in the cache, in order to get the same data */
503     if (!last_stage) {
504         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
505     }
506 
507     /* Send XBZRLE based compressed page */
508     bytes_xbzrle = save_page_header(rs, rs->f, block,
509                                     offset | RAM_SAVE_FLAG_XBZRLE);
510     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
511     qemu_put_be16(rs->f, encoded_len);
512     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
513     bytes_xbzrle += encoded_len + 1 + 2;
514     xbzrle_counters.pages++;
515     xbzrle_counters.bytes += bytes_xbzrle;
516     ram_counters.transferred += bytes_xbzrle;
517 
518     return 1;
519 }
520 
521 /**
522  * migration_bitmap_find_dirty: find the next dirty page from start
523  *
524  * Called with rcu_read_lock() to protect migration_bitmap
525  *
526  * Returns the byte offset within memory region of the start of a dirty page
527  *
528  * @rs: current RAM state
529  * @rb: RAMBlock where to search for dirty pages
530  * @start: page where we start the search
531  */
532 static inline
533 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
534                                           unsigned long start)
535 {
536     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
537     unsigned long *bitmap = rb->bmap;
538     unsigned long next;
539 
540     if (rs->ram_bulk_stage && start > 0) {
541         next = start + 1;
542     } else {
543         next = find_next_bit(bitmap, size, start);
544     }
545 
546     return next;
547 }
548 
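/* Clear the dirty bit for @page in @rb's bitmap and, if it was set,
 * decrement the global count of dirty pages.  Returns the previous
 * state of the bit.
 */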
549 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
550                                                 RAMBlock *rb,
551                                                 unsigned long page)
552 {
553     bool ret;
554 
555     ret = test_and_clear_bit(page, rb->bmap);
556 
557     if (ret) {
558         rs->migration_dirty_pages--;
559     }
560     return ret;
561 }
562 
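/* Pull the dirty log for [start, start + length) of @rb into the
 * block's migration bitmap and account the newly dirtied pages.
 */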
563 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
564                                         ram_addr_t start, ram_addr_t length)
565 {
566     rs->migration_dirty_pages +=
567         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
568                                               &rs->num_dirty_pages_period);
569 }
570 
571 /**
572  * ram_pagesize_summary: calculate all the pagesizes of a VM
573  *
574  * Returns a summary bitmap of the page sizes of all RAMBlocks
575  *
576  * For VMs with just normal pages this is equivalent to the host page
577  * size. If it's got some huge pages then it's the OR of all the
578  * different page sizes.
579  */
580 uint64_t ram_pagesize_summary(void)
581 {
582     RAMBlock *block;
583     uint64_t summary = 0;
584 
585     RAMBLOCK_FOREACH(block) {
586         summary |= block->page_size;
587     }
588 
589     return summary;
590 }
591 
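/**
 * migration_bitmap_sync: sync the dirty log into the migration bitmaps
 *
 * Synchronize every RAMBlock's migration bitmap with the global dirty
 * log, and (at most once per second) update the dirty-page-rate
 * statistics that drive auto-converge throttling and the XBZRLE
 * cache-miss rate.
 *
 * @rs: current RAM state
 */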
592 static void migration_bitmap_sync(RAMState *rs)
593 {
594     RAMBlock *block;
595     int64_t end_time;
596     uint64_t bytes_xfer_now;
597 
598     ram_counters.dirty_sync_count++;
599 
600     if (!rs->time_last_bitmap_sync) {
601         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
602     }
603 
604     trace_migration_bitmap_sync_start();
605     memory_global_dirty_log_sync();
606 
607     qemu_mutex_lock(&rs->bitmap_mutex);
608     rcu_read_lock();
609     RAMBLOCK_FOREACH(block) {
610         migration_bitmap_sync_range(rs, block, 0, block->used_length);
611     }
612     rcu_read_unlock();
613     qemu_mutex_unlock(&rs->bitmap_mutex);
614 
615     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
616 
617     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
618 
619     /* more than 1 second = 1000 milliseconds */
620     if (end_time > rs->time_last_bitmap_sync + 1000) {
621         /* calculate period counters */
622         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
623             / (end_time - rs->time_last_bitmap_sync);
624         bytes_xfer_now = ram_counters.transferred;
625 
626         if (migrate_auto_converge()) {
627             /* The following detection logic can be refined later. For now:
628                Check to see if the dirtied bytes are 50% more than the approx.
629                amount of bytes that just got transferred since the last time we
630                were in this routine. If that happens twice, start or increase
631                throttling */
632 
633             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
634                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
635                 (++rs->dirty_rate_high_cnt >= 2)) {
636                     trace_migration_throttle();
637                     rs->dirty_rate_high_cnt = 0;
638                     mig_throttle_guest_down();
639             }
640         }
641 
642         if (migrate_use_xbzrle()) {
643             if (rs->iterations_prev != rs->iterations) {
644                 xbzrle_counters.cache_miss_rate =
645                    (double)(xbzrle_counters.cache_miss -
646                             rs->xbzrle_cache_miss_prev) /
647                    (rs->iterations - rs->iterations_prev);
648             }
649             rs->iterations_prev = rs->iterations;
650             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
651         }
652 
653         /* reset period counters */
654         rs->time_last_bitmap_sync = end_time;
655         rs->num_dirty_pages_period = 0;
656         rs->bytes_xfer_prev = bytes_xfer_now;
657     }
658     if (migrate_use_events()) {
659         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
660     }
661 }
662 
663 /**
664  * save_zero_page: send the zero page to the stream
665  *
666  * Returns the number of pages written.
667  *
668  * @rs: current RAM state
669  * @block: block that contains the page we want to send
670  * @offset: offset inside the block for the page
671  * @p: pointer to the page
672  */
673 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
674                           uint8_t *p)
675 {
676     int pages = -1;
677 
678     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
679         ram_counters.duplicate++;
680         ram_counters.transferred +=
681             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
682         qemu_put_byte(rs->f, 0);
683         ram_counters.transferred += 1;
684         pages = 1;
685     }
686 
687     return pages;
688 }
689 
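/* Discard the given range of pages on the source so their memory can
 * be returned to the host; only does so when release-ram is enabled
 * and we are in postcopy.
 */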
690 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
691 {
692     if (!migrate_release_ram() || !migration_in_postcopy()) {
693         return;
694     }
695 
696     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
697 }
698 
699 /**
700  * ram_save_page: send the given page to the stream
701  *
702  * Returns the number of pages written.
703  *          < 0 - error
704  *          >=0 - Number of pages written - this might legally be 0
705  *                if xbzrle noticed the page was the same.
706  *
707  * @rs: current RAM state
708  * @block: block that contains the page we want to send
709  * @offset: offset inside the block for the page
710  * @last_stage: if we are at the completion stage
711  */
712 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
713 {
714     int pages = -1;
715     uint64_t bytes_xmit;
716     ram_addr_t current_addr;
717     uint8_t *p;
718     int ret;
719     bool send_async = true;
720     RAMBlock *block = pss->block;
721     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
722 
723     p = block->host + offset;
724     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
725 
726     /* If in doubt, send the page as a normal page */
727     bytes_xmit = 0;
728     ret = ram_control_save_page(rs->f, block->offset,
729                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
730     if (bytes_xmit) {
731         ram_counters.transferred += bytes_xmit;
732         pages = 1;
733     }
734 
735     XBZRLE_cache_lock();
736 
737     current_addr = block->offset + offset;
738 
739     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
740         if (ret != RAM_SAVE_CONTROL_DELAYED) {
741             if (bytes_xmit > 0) {
742                 ram_counters.normal++;
743             } else if (bytes_xmit == 0) {
744                 ram_counters.duplicate++;
745             }
746         }
747     } else {
748         pages = save_zero_page(rs, block, offset, p);
749         if (pages > 0) {
750             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
751              * page would be stale
752              */
753             xbzrle_cache_zero_page(rs, current_addr);
754             ram_release_pages(block->idstr, offset, pages);
755         } else if (!rs->ram_bulk_stage &&
756                    !migration_in_postcopy() && migrate_use_xbzrle()) {
757             pages = save_xbzrle_page(rs, &p, current_addr, block,
758                                      offset, last_stage);
759             if (!last_stage) {
760                 /* Can't send this cached data async, since the cache page
761                  * might get updated before it gets to the wire
762                  */
763                 send_async = false;
764             }
765         }
766     }
767 
768     /* XBZRLE overflow or normal page */
769     if (pages == -1) {
770         ram_counters.transferred +=
771             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
772         if (send_async) {
773             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
774                                   migrate_release_ram() &&
775                                   migration_in_postcopy());
776         } else {
777             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
778         }
779         ram_counters.transferred += TARGET_PAGE_SIZE;
780         pages = 1;
781         ram_counters.normal++;
782     }
783 
784     XBZRLE_cache_unlock();
785 
786     return pages;
787 }
788 
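/* Compress a single target page, writing the page header plus the
 * compressed data into @f (a per-thread buffer file).  Returns the
 * number of bytes produced, or 0 if compression failed.
 */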
789 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
790                                 ram_addr_t offset)
791 {
792     RAMState *rs = ram_state;
793     int bytes_sent, blen;
794     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
795 
796     bytes_sent = save_page_header(rs, f, block, offset |
797                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
798     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
799                                      migrate_compress_level());
800     if (blen < 0) {
801         bytes_sent = 0;
802         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
803         error_report("compressed data failed!");
804     } else {
805         bytes_sent += blen;
806         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
807     }
808 
809     return bytes_sent;
810 }
811 
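/* Wait for every compression thread to finish its current page and
 * flush the per-thread buffer files into the migration stream.
 */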
812 static void flush_compressed_data(RAMState *rs)
813 {
814     int idx, len, thread_count;
815 
816     if (!migrate_use_compression()) {
817         return;
818     }
819     thread_count = migrate_compress_threads();
820 
821     qemu_mutex_lock(&comp_done_lock);
822     for (idx = 0; idx < thread_count; idx++) {
823         while (!comp_param[idx].done) {
824             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
825         }
826     }
827     qemu_mutex_unlock(&comp_done_lock);
828 
829     for (idx = 0; idx < thread_count; idx++) {
830         qemu_mutex_lock(&comp_param[idx].mutex);
831         if (!comp_param[idx].quit) {
832             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
833             ram_counters.transferred += len;
834         }
835         qemu_mutex_unlock(&comp_param[idx].mutex);
836     }
837 }
838 
839 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
840                                        ram_addr_t offset)
841 {
842     param->block = block;
843     param->offset = offset;
844 }
845 
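/* Hand the page at (@block, @offset) to an idle compression thread,
 * first flushing that thread's previous output into the stream.
 * Blocks until a thread becomes available.  Returns the number of
 * pages queued, which is always 1.
 */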
846 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
847                                            ram_addr_t offset)
848 {
849     int idx, thread_count, bytes_xmit = -1, pages = -1;
850 
851     thread_count = migrate_compress_threads();
852     qemu_mutex_lock(&comp_done_lock);
853     while (true) {
854         for (idx = 0; idx < thread_count; idx++) {
855             if (comp_param[idx].done) {
856                 comp_param[idx].done = false;
857                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
858                 qemu_mutex_lock(&comp_param[idx].mutex);
859                 set_compress_params(&comp_param[idx], block, offset);
860                 qemu_cond_signal(&comp_param[idx].cond);
861                 qemu_mutex_unlock(&comp_param[idx].mutex);
862                 pages = 1;
863                 ram_counters.normal++;
864                 ram_counters.transferred += bytes_xmit;
865                 break;
866             }
867         }
868         if (pages > 0) {
869             break;
870         } else {
871             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
872         }
873     }
874     qemu_mutex_unlock(&comp_done_lock);
875 
876     return pages;
877 }
878 
879 /**
880  * ram_save_compressed_page: compress the given page and send it to the stream
881  *
882  * Returns the number of pages written.
883  *
884  * @rs: current RAM state
885  * @block: block that contains the page we want to send
886  * @offset: offset inside the block for the page
887  * @last_stage: if we are at the completion stage
888  */
889 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
890                                     bool last_stage)
891 {
892     int pages = -1;
893     uint64_t bytes_xmit = 0;
894     uint8_t *p;
895     int ret, blen;
896     RAMBlock *block = pss->block;
897     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
898 
899     p = block->host + offset;
900 
901     ret = ram_control_save_page(rs->f, block->offset,
902                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
903     if (bytes_xmit) {
904         ram_counters.transferred += bytes_xmit;
905         pages = 1;
906     }
907     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
908         if (ret != RAM_SAVE_CONTROL_DELAYED) {
909             if (bytes_xmit > 0) {
910                 ram_counters.normal++;
911             } else if (bytes_xmit == 0) {
912                 ram_counters.duplicate++;
913             }
914         }
915     } else {
916         /* When starting to process a new block, the first page of
917          * the block should be sent out before the other pages in the
918          * same block, and all the pages in the last block should have
919          * been sent out.  Keeping this order is important, because the
920          * 'cont' flag is used to avoid resending the block name.
921          */
922         if (block != rs->last_sent_block) {
923             flush_compressed_data(rs);
924             pages = save_zero_page(rs, block, offset, p);
925             if (pages == -1) {
926                 /* Make sure the first page is sent out before other pages */
927                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
928                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
929                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
930                                                  migrate_compress_level());
931                 if (blen > 0) {
932                     ram_counters.transferred += bytes_xmit + blen;
933                     ram_counters.normal++;
934                     pages = 1;
935                 } else {
936                     qemu_file_set_error(rs->f, blen);
937                     error_report("compressed data failed!");
938                 }
939             }
940             if (pages > 0) {
941                 ram_release_pages(block->idstr, offset, pages);
942             }
943         } else {
944             pages = save_zero_page(rs, block, offset, p);
945             if (pages == -1) {
946                 pages = compress_page_with_multi_thread(rs, block, offset);
947             } else {
948                 ram_release_pages(block->idstr, offset, pages);
949             }
950         }
951     }
952 
953     return pages;
954 }
955 
956 /**
957  * find_dirty_block: find the next dirty page and update any state
958  * associated with the search process.
959  *
960  * Returns true if a page is found, false otherwise
961  *
962  * @rs: current RAM state
963  * @pss: data about the state of the current dirty page scan
964  * @again: set to false if the search has scanned the whole of RAM
965  */
966 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
967 {
968     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
969     if (pss->complete_round && pss->block == rs->last_seen_block &&
970         pss->page >= rs->last_page) {
971         /*
972          * We've been once around the RAM and haven't found anything.
973          * Give up.
974          */
975         *again = false;
976         return false;
977     }
978     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
979         /* Didn't find anything in this RAM Block */
980         pss->page = 0;
981         pss->block = QLIST_NEXT_RCU(pss->block, next);
982         if (!pss->block) {
983             /* Hit the end of the list */
984             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
985             /* Flag that we've looped */
986             pss->complete_round = true;
987             rs->ram_bulk_stage = false;
988             if (migrate_use_xbzrle()) {
989                 /* If xbzrle is on, stop using the data compression at this
990                  * point. In theory, xbzrle can do better than compression.
991                  */
992                 flush_compressed_data(rs);
993             }
994         }
995         /* Didn't find anything this time, but try again on the new block */
996         *again = true;
997         return false;
998     } else {
999         /* Can go around again, but... */
1000         *again = true;
1001         /* We've found something so probably don't need to */
1002         return true;
1003     }
1004 }
1005 
1006 /**
1007  * unqueue_page: gets a page off the queue
1008  *
1009  * Helper for 'get_queued_page' - gets a page off the queue
1010  *
1011  * Returns the block of the page (or NULL if none available)
1012  *
1013  * @rs: current RAM state
1014  * @offset: used to return the offset within the RAMBlock
1015  */
1016 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1017 {
1018     RAMBlock *block = NULL;
1019 
1020     qemu_mutex_lock(&rs->src_page_req_mutex);
1021     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1022         struct RAMSrcPageRequest *entry =
1023                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1024         block = entry->rb;
1025         *offset = entry->offset;
1026 
1027         if (entry->len > TARGET_PAGE_SIZE) {
1028             entry->len -= TARGET_PAGE_SIZE;
1029             entry->offset += TARGET_PAGE_SIZE;
1030         } else {
1031             memory_region_unref(block->mr);
1032             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1033             g_free(entry);
1034         }
1035     }
1036     qemu_mutex_unlock(&rs->src_page_req_mutex);
1037 
1038     return block;
1039 }
1040 
1041 /**
1042  * get_queued_page: unqueue a page from the postcopy requests
1043  *
1044  * Skips pages that are already sent (!dirty)
1045  *
1046  * Returns true if a queued page was found
1047  *
1048  * @rs: current RAM state
1049  * @pss: data about the state of the current dirty page scan
1050  */
1051 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1052 {
1053     RAMBlock  *block;
1054     ram_addr_t offset;
1055     bool dirty;
1056 
1057     do {
1058         block = unqueue_page(rs, &offset);
1059         /*
1060          * We're sending this page, and since it's postcopy nothing else
1061          * will dirty it, and we must make sure it doesn't get sent again
1062          * even if this queue request was received after the background
1063          * search already sent it.
1064          */
1065         if (block) {
1066             unsigned long page;
1067 
1068             page = offset >> TARGET_PAGE_BITS;
1069             dirty = test_bit(page, block->bmap);
1070             if (!dirty) {
1071                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1072                        page, test_bit(page, block->unsentmap));
1073             } else {
1074                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1075             }
1076         }
1077 
1078     } while (block && !dirty);
1079 
1080     if (block) {
1081         /*
1082          * As soon as we start servicing pages out of order, we have
1083          * to kill the bulk stage, since the bulk stage assumes
1084          * (in migration_bitmap_find_dirty) that every page is
1085          * dirty; that's no longer true.
1086          */
1087         rs->ram_bulk_stage = false;
1088 
1089         /*
1090          * We want the background search to continue from the queued page
1091          * since the guest is likely to want other pages near to the page
1092          * it just requested.
1093          */
1094         pss->block = block;
1095         pss->page = offset >> TARGET_PAGE_BITS;
1096     }
1097 
1098     return !!block;
1099 }
1100 
1101 /**
1102  * migration_page_queue_free: drop any remaining pages in the ram
1103  * request queue
1104  *
1105  * It should be empty at the end anyway, but in error cases there may
1106  * be some left.  in case that there is any page left, we drop it.
1107  *
1108  */
1109 static void migration_page_queue_free(RAMState *rs)
1110 {
1111     struct RAMSrcPageRequest *mspr, *next_mspr;
1112     /* This queue should generally be empty - but in the case of a failed
1113      * migration it might have some leftover entries.
1114      */
1115     rcu_read_lock();
1116     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1117         memory_region_unref(mspr->rb->mr);
1118         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1119         g_free(mspr);
1120     }
1121     rcu_read_unlock();
1122 }
1123 
1124 /**
1125  * ram_save_queue_pages: queue the page for transmission
1126  *
1127  * A request from postcopy destination for example.
1128  *
1129  * Returns zero on success or negative on error
1130  *
1131  * @rbname: Name of the RAMBlock of the request. NULL means the
1132  *          same as the last one.
1133  * @start: starting address from the start of the RAMBlock
1134  * @len: length (in bytes) to send
1135  */
1136 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1137 {
1138     RAMBlock *ramblock;
1139     RAMState *rs = ram_state;
1140 
1141     ram_counters.postcopy_requests++;
1142     rcu_read_lock();
1143     if (!rbname) {
1144         /* Reuse last RAMBlock */
1145         ramblock = rs->last_req_rb;
1146 
1147         if (!ramblock) {
1148             /*
1149              * Shouldn't happen, we can't reuse the last RAMBlock if
1150              * it's the 1st request.
1151              */
1152             error_report("ram_save_queue_pages no previous block");
1153             goto err;
1154         }
1155     } else {
1156         ramblock = qemu_ram_block_by_name(rbname);
1157 
1158         if (!ramblock) {
1159             /* We shouldn't be asked for a non-existent RAMBlock */
1160             error_report("ram_save_queue_pages no block '%s'", rbname);
1161             goto err;
1162         }
1163         rs->last_req_rb = ramblock;
1164     }
1165     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1166     if (start+len > ramblock->used_length) {
1167         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1168                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1169                      __func__, start, len, ramblock->used_length);
1170         goto err;
1171     }
1172 
1173     struct RAMSrcPageRequest *new_entry =
1174         g_malloc0(sizeof(struct RAMSrcPageRequest));
1175     new_entry->rb = ramblock;
1176     new_entry->offset = start;
1177     new_entry->len = len;
1178 
1179     memory_region_ref(ramblock->mr);
1180     qemu_mutex_lock(&rs->src_page_req_mutex);
1181     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1182     qemu_mutex_unlock(&rs->src_page_req_mutex);
1183     rcu_read_unlock();
1184 
1185     return 0;
1186 
1187 err:
1188     rcu_read_unlock();
1189     return -1;
1190 }
1191 
1192 /**
1193  * ram_save_target_page: save one target page
1194  *
1195  * Returns the number of pages written
1196  *
1197  * @rs: current RAM state
1199  * @pss: data about the page we want to send
1200  * @last_stage: if we are at the completion stage
1201  */
1202 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1203                                 bool last_stage)
1204 {
1205     int res = 0;
1206 
1207     /* Check if the page is dirty and, if it is, send it */
1208     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1209         /*
1210          * If xbzrle is on, stop using the data compression after first
1211          * round of migration even if compression is enabled. In theory,
1212          * xbzrle can do better than compression.
1213          */
1214         if (migrate_use_compression() &&
1215             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1216             res = ram_save_compressed_page(rs, pss, last_stage);
1217         } else {
1218             res = ram_save_page(rs, pss, last_stage);
1219         }
1220 
1221         if (res < 0) {
1222             return res;
1223         }
1224         if (pss->block->unsentmap) {
1225             clear_bit(pss->page, pss->block->unsentmap);
1226         }
1227     }
1228 
1229     return res;
1230 }
1231 
1232 /**
1233  * ram_save_host_page: save a whole host page
1234  *
1235  * Starting at the page indicated in @pss, send pages up to the end
1236  * of the current host page.  It's valid for the starting page to point
1237  * into the middle of a host page, in which case the remainder is sent.
1238  * Only dirty target pages are sent. Note that the host page size may
1239  * be a huge page for this block.
1240  * The saving stops at the boundary of the used_length of the block
1241  * if the RAMBlock isn't a multiple of the host page size.
1242  *
1243  * Returns the number of pages written or negative on error
1244  *
1245  * @rs: current RAM state
1247  * @pss: data about the page we want to send
1248  * @last_stage: if we are at the completion stage
1249  */
1250 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1251                               bool last_stage)
1252 {
1253     int tmppages, pages = 0;
1254     size_t pagesize_bits =
1255         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1256 
1257     do {
1258         tmppages = ram_save_target_page(rs, pss, last_stage);
1259         if (tmppages < 0) {
1260             return tmppages;
1261         }
1262 
1263         pages += tmppages;
1264         pss->page++;
1265     } while ((pss->page & (pagesize_bits - 1)) &&
1266              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1267 
1268     /* The offset we leave with is the last one we looked at */
1269     pss->page--;
1270     return pages;
1271 }
1272 
1273 /**
1274  * ram_find_and_save_block: finds a dirty page and sends it to f
1275  *
1276  * Called within an RCU critical section.
1277  *
1278  * Returns the number of pages written where zero means no dirty pages
1279  *
1280  * @rs: current RAM state
1281  * @last_stage: if we are at the completion stage
1282  *
1283  * On systems where host-page-size > target-page-size it will send all the
1284  * pages in a host page that are dirty.
1285  */
1286 
1287 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1288 {
1289     PageSearchStatus pss;
1290     int pages = 0;
1291     bool again, found;
1292 
1293     /* No dirty page as there is zero RAM */
1294     if (!ram_bytes_total()) {
1295         return pages;
1296     }
1297 
1298     pss.block = rs->last_seen_block;
1299     pss.page = rs->last_page;
1300     pss.complete_round = false;
1301 
1302     if (!pss.block) {
1303         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1304     }
1305 
1306     do {
1307         again = true;
1308         found = get_queued_page(rs, &pss);
1309 
1310         if (!found) {
1311             /* priority queue empty, so just search for something dirty */
1312             found = find_dirty_block(rs, &pss, &again);
1313         }
1314 
1315         if (found) {
1316             pages = ram_save_host_page(rs, &pss, last_stage);
1317         }
1318     } while (!pages && again);
1319 
1320     rs->last_seen_block = pss.block;
1321     rs->last_page = pss.page;
1322 
1323     return pages;
1324 }
1325 
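/* Account for @size bytes of RAM whose transfer was handled outside
 * of the normal page-saving path; @zero says whether they were zero
 * pages.
 */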
1326 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1327 {
1328     uint64_t pages = size / TARGET_PAGE_SIZE;
1329 
1330     if (zero) {
1331         ram_counters.duplicate += pages;
1332     } else {
1333         ram_counters.normal += pages;
1334         ram_counters.transferred += size;
1335         qemu_update_position(f, size);
1336     }
1337 }
1338 
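/* Returns the total used size (in bytes) of all RAMBlocks */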
1339 uint64_t ram_bytes_total(void)
1340 {
1341     RAMBlock *block;
1342     uint64_t total = 0;
1343 
1344     rcu_read_lock();
1345     RAMBLOCK_FOREACH(block) {
1346         total += block->used_length;
1347     }
1348     rcu_read_unlock();
1349     return total;
1350 }
1351 
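/* Allocate the buffer used to decode XBZRLE pages on the destination */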
1352 static void xbzrle_load_setup(void)
1353 {
1354     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1355 }
1356 
1357 static void xbzrle_load_cleanup(void)
1358 {
1359     g_free(XBZRLE.decoded_buf);
1360     XBZRLE.decoded_buf = NULL;
1361 }
1362 
1363 static void ram_save_cleanup(void *opaque)
1364 {
1365     RAMState **rsp = opaque;
1366     RAMBlock *block;
1367 
1368     /* The caller must hold the iothread lock or be in a BH, so there is
1369      * no writing race against this migration bitmap
1370      */
1371     memory_global_dirty_log_stop();
1372 
1373     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1374         g_free(block->bmap);
1375         block->bmap = NULL;
1376         g_free(block->unsentmap);
1377         block->unsentmap = NULL;
1378     }
1379 
1380     XBZRLE_cache_lock();
1381     if (XBZRLE.cache) {
1382         cache_fini(XBZRLE.cache);
1383         g_free(XBZRLE.encoded_buf);
1384         g_free(XBZRLE.current_buf);
1385         g_free(XBZRLE.zero_target_page);
1386         XBZRLE.cache = NULL;
1387         XBZRLE.encoded_buf = NULL;
1388         XBZRLE.current_buf = NULL;
1389         XBZRLE.zero_target_page = NULL;
1390     }
1391     XBZRLE_cache_unlock();
1392     migration_page_queue_free(*rsp);
1393     compress_threads_save_cleanup();
1394     g_free(*rsp);
1395     *rsp = NULL;
1396 }
1397 
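/* Reset the page-search state so the next pass starts from the first
 * block and runs as a fresh bulk stage.
 */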
1398 static void ram_state_reset(RAMState *rs)
1399 {
1400     rs->last_seen_block = NULL;
1401     rs->last_sent_block = NULL;
1402     rs->last_page = 0;
1403     rs->last_version = ram_list.version;
1404     rs->ram_bulk_stage = true;
1405 }
1406 
1407 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1408 
1409 /*
1410  * 'expected' is the value you expect the bitmap mostly to be full
1411  * of; it won't bother printing lines that are all this value.
1412  * If 'todump' is null the migration bitmap is dumped.
1413  */
1414 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1415                            unsigned long pages)
1416 {
1417     int64_t cur;
1418     int64_t linelen = 128;
1419     char linebuf[129];
1420 
1421     for (cur = 0; cur < pages; cur += linelen) {
1422         int64_t curb;
1423         bool found = false;
1424         /*
1425          * Last line; catch the case where the line length
1426          * is longer than remaining ram
1427          */
1428         if (cur + linelen > pages) {
1429             linelen = pages - cur;
1430         }
1431         for (curb = 0; curb < linelen; curb++) {
1432             bool thisbit = test_bit(cur + curb, todump);
1433             linebuf[curb] = thisbit ? '1' : '.';
1434             found = found || (thisbit != expected);
1435         }
1436         if (found) {
1437             linebuf[curb] = '\0';
1438             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1439         }
1440     }
1441 }
1442 
1443 /* **** functions for postcopy ***** */
1444 
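/**
 * ram_postcopy_migrated_memory_release: release pages already sent
 *
 * Walk every RAMBlock and discard, on the source, the ranges whose
 * pages are no longer dirty, i.e. that have already been transferred
 * to the destination.
 *
 * @ms: current migration state
 */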
1445 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1446 {
1447     struct RAMBlock *block;
1448 
1449     RAMBLOCK_FOREACH(block) {
1450         unsigned long *bitmap = block->bmap;
1451         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1452         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1453 
1454         while (run_start < range) {
1455             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1456             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1457                               (run_end - run_start) << TARGET_PAGE_BITS);
1458             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1459         }
1460     }
1461 }
1462 
1463 /**
1464  * postcopy_send_discard_bm_ram: discard a RAMBlock
1465  *
1466  * Returns zero on success
1467  *
1468  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1469  * Note: At this point the 'unsentmap' is the processed bitmap combined
1470  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1471  *
1472  * @ms: current migration state
1473  * @pds: state for postcopy
1474  * @start: RAMBlock starting page
1475  * @length: RAMBlock size
1476  */
1477 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1478                                         PostcopyDiscardState *pds,
1479                                         RAMBlock *block)
1480 {
1481     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1482     unsigned long current;
1483     unsigned long *unsentmap = block->unsentmap;
1484 
1485     for (current = 0; current < end; ) {
1486         unsigned long one = find_next_bit(unsentmap, end, current);
1487 
1488         if (one <= end) {
1489             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1490             unsigned long discard_length;
1491 
1492             if (zero >= end) {
1493                 discard_length = end - one;
1494             } else {
1495                 discard_length = zero - one;
1496             }
1497             if (discard_length) {
1498                 postcopy_discard_send_range(ms, pds, one, discard_length);
1499             }
1500             current = one + discard_length;
1501         } else {
1502             current = one;
1503         }
1504     }
1505 
1506     return 0;
1507 }
1508 
1509 /**
1510  * postcopy_each_ram_send_discard: discard all RAMBlocks
1511  *
1512  * Returns 0 for success or negative for error
1513  *
1514  * Utility for the outgoing postcopy code.
1515  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1516  *   passing it bitmap indexes and name.
1517  * (qemu_ram_foreach_block ends up passing unscaled lengths
1518  *  which would mean the postcopy code would have to deal with target pages)
1519  *
1520  * @ms: current migration state
1521  */
1522 static int postcopy_each_ram_send_discard(MigrationState *ms)
1523 {
1524     struct RAMBlock *block;
1525     int ret;
1526 
1527     RAMBLOCK_FOREACH(block) {
1528         PostcopyDiscardState *pds =
1529             postcopy_discard_send_init(ms, block->idstr);
1530 
1531         /*
1532          * Postcopy sends chunks of bitmap over the wire, but it
1533          * just needs indexes at this point; this avoids it having
1534          * target-page-specific code.
1535          */
1536         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1537         postcopy_discard_send_finish(ms, pds);
1538         if (ret) {
1539             return ret;
1540         }
1541     }
1542 
1543     return 0;
1544 }
1545 
1546 /**
1547  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1548  *
1549  * Helper for postcopy_chunk_hostpages; it's called twice to
1550  * canonicalize the two bitmaps, which are similar but one is
1551  * inverted.
1552  *
1553  * Postcopy requires that all target pages in a hostpage are dirty or
1554  * clean, not a mix.  This function canonicalizes the bitmaps.
1555  *
1556  * @ms: current migration state
1557  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1558  *               otherwise we need to canonicalize partially dirty host pages
1559  * @block: block that contains the page we want to canonicalize
1560  * @pds: state for postcopy
1561  */
1562 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1563                                           RAMBlock *block,
1564                                           PostcopyDiscardState *pds)
1565 {
1566     RAMState *rs = ram_state;
1567     unsigned long *bitmap = block->bmap;
1568     unsigned long *unsentmap = block->unsentmap;
1569     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1570     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1571     unsigned long run_start;
1572 
1573     if (block->page_size == TARGET_PAGE_SIZE) {
1574         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1575         return;
1576     }
1577 
1578     if (unsent_pass) {
1579         /* Find a sent page */
1580         run_start = find_next_zero_bit(unsentmap, pages, 0);
1581     } else {
1582         /* Find a dirty page */
1583         run_start = find_next_bit(bitmap, pages, 0);
1584     }
1585 
1586     while (run_start < pages) {
1587         bool do_fixup = false;
1588         unsigned long fixup_start_addr;
1589         unsigned long host_offset;
1590 
1591         /*
1592          * If the start of this run of pages is in the middle of a host
1593          * page, then we need to fixup this host page.
1594          */
1595         host_offset = run_start % host_ratio;
1596         if (host_offset) {
1597             do_fixup = true;
1598             run_start -= host_offset;
1599             fixup_start_addr = run_start;
1600             /* For the next pass */
1601             run_start = run_start + host_ratio;
1602         } else {
1603             /* Find the end of this run */
1604             unsigned long run_end;
1605             if (unsent_pass) {
1606                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1607             } else {
1608                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1609             }
1610             /*
1611              * If the end isn't at the start of a host page, then the
1612              * run doesn't finish at the end of a host page
1613              * and we need to discard.
1614              */
1615             host_offset = run_end % host_ratio;
1616             if (host_offset) {
1617                 do_fixup = true;
1618                 fixup_start_addr = run_end - host_offset;
1619                 /*
1620                  * This host page has been dealt with; the next loop
1621                  * iteration starts after the fixup
1622                  */
1623                 run_start = fixup_start_addr + host_ratio;
1624             } else {
1625                 /*
1626                  * No discards on this iteration, next loop starts from
1627                  * next sent/dirty page
1628                  */
1629                 run_start = run_end + 1;
1630             }
1631         }
1632 
1633         if (do_fixup) {
1634             unsigned long page;
1635 
1636             /* Tell the destination to discard this page */
1637             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1638                 /* For the unsent_pass we:
1639                  *     discard partially sent pages
1640                  * For the !unsent_pass (dirty) we:
1641                  *     discard partially dirty pages that were sent
1642                  *     (any partially sent pages were already discarded
1643                  *     by the previous unsent_pass)
1644                  */
1645                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1646                                             host_ratio);
1647             }
1648 
1649             /* Clean up the bitmap */
1650             for (page = fixup_start_addr;
1651                  page < fixup_start_addr + host_ratio; page++) {
1652                 /* All pages in this host page are now not sent */
1653                 set_bit(page, unsentmap);
1654 
1655                 /*
1656                  * Remark them as dirty, updating the count for any pages
1657                  * that weren't previously dirty.
1658                  */
1659                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1660             }
1661         }
1662 
1663         if (unsent_pass) {
1664             /* Find the next sent page for the next iteration */
1665             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1666         } else {
1667             /* Find the next dirty page for the next iteration */
1668             run_start = find_next_bit(bitmap, pages, run_start);
1669         }
1670     }
1671 }
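/*
 * A worked example of the pass above, assuming a hypothetical host page of
 * four target pages (host_ratio == 4) and this dirty bitmap for one block
 * (the !unsent_pass case):
 *
 *     target page:  0 1 2 3 | 4 5 6 7
 *     dirty bit:    0 0 1 1 | 1 1 0 0
 *
 * The first dirty run starts at page 2, which is in the middle of host
 * page 0, so pages 0-3 are discarded on the destination (if already sent)
 * and remarked dirty and unsent.  The next run starts at page 4 and ends
 * at page 6, in the middle of host page 1, so pages 4-7 get the same
 * treatment.  Afterwards every host page is either fully dirty or fully
 * clean, which is what postcopy placement requires.
 */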
1672 
1673 /**
1674  * postcopy_chunk_hostpages: discard any partially sent host page
1675  *
1676  * Utility for the outgoing postcopy code.
1677  *
1678  * Discard any partially sent host-page sized chunks and mark any partially
1679  * dirty host-page sized chunks as fully dirty.  Here the host page is the
1680  * host page for the particular RAMBlock, i.e. it might be a huge page.
1681  *
1682  * Returns zero on success
1683  *
1684  * @ms: current migration state
1685  * @block: block we want to work with
1686  */
1687 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1688 {
1689     PostcopyDiscardState *pds =
1690         postcopy_discard_send_init(ms, block->idstr);
1691 
1692     /* First pass: Discard all partially sent host pages */
1693     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1694     /*
1695      * Second pass: Ensure that all partially dirty host pages are made
1696      * fully dirty.
1697      */
1698     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1699 
1700     postcopy_discard_send_finish(ms, pds);
1701     return 0;
1702 }
1703 
1704 /**
1705  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1706  *
1707  * Returns zero on success
1708  *
1709  * Transmit the set of pages to be discarded after precopy to the target;
1710  * these are pages that:
1711  *     a) Have been previously transmitted but are now dirty again
1712  *     b) Have never been transmitted; this ensures that any pages on the
1713  *        destination that have been mapped by background tasks get
1714  *        discarded (transparent huge pages are the specific concern)
1715  * Hopefully this is pretty sparse.
1716  *
1717  * @ms: current migration state
1718  */
1719 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1720 {
1721     RAMState *rs = ram_state;
1722     RAMBlock *block;
1723     int ret;
1724 
1725     rcu_read_lock();
1726 
1727     /* This should be our last sync, the src is now paused */
1728     migration_bitmap_sync(rs);
1729 
1730     /* Easiest way to make sure we don't resume in the middle of a host-page */
1731     rs->last_seen_block = NULL;
1732     rs->last_sent_block = NULL;
1733     rs->last_page = 0;
1734 
1735     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1736         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1737         unsigned long *bitmap = block->bmap;
1738         unsigned long *unsentmap = block->unsentmap;
1739 
1740         if (!unsentmap) {
1741             /* We don't have a safe way to resize the unsentmap, so
1742              * if the bitmap was resized it will be NULL at this
1743              * point.
1744              */
1745             error_report("migration ram resized during precopy phase");
1746             rcu_read_unlock();
1747             return -EINVAL;
1748         }
1749         /* Deal with TPS != HPS and huge pages */
1750         ret = postcopy_chunk_hostpages(ms, block);
1751         if (ret) {
1752             rcu_read_unlock();
1753             return ret;
1754         }
1755 
1756         /*
1757          * Update the unsentmap to be unsentmap = unsentmap | dirty
1758          */
1759         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1760 #ifdef DEBUG_POSTCOPY
1761         ram_debug_dump_bitmap(unsentmap, true, pages);
1762 #endif
1763     }
1764     trace_ram_postcopy_send_discard_bitmap();
1765 
1766     ret = postcopy_each_ram_send_discard(ms);
1767     rcu_read_unlock();
1768 
1769     return ret;
1770 }
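/*
 * After the bitmap_or() above, the per-target-page decision the destination
 * will see is effectively:
 *
 *     dirty  unsent   discard on destination?
 *       0      0      no  (sent and still clean)
 *       0      1      yes (never sent; may have been populated by THP)
 *       1      0      yes (sent, but dirtied again since)
 *       1      1      yes
 */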
1771 
1772 /**
1773  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1774  *
1775  * Returns zero on success
1776  *
1777  * @rbname: name of the RAMBlock of the request. NULL means the
1778  *          same as the last one.
1779  * @start: starting offset within the RAMBlock, in bytes
1780  * @length: length of the range to discard, in bytes
1781  */
1782 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1783 {
1784     int ret = -1;
1785 
1786     trace_ram_discard_range(rbname, start, length);
1787 
1788     rcu_read_lock();
1789     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1790 
1791     if (!rb) {
1792         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1793         goto err;
1794     }
1795 
1796     ret = ram_block_discard_range(rb, start, length);
1797 
1798 err:
1799     rcu_read_unlock();
1800 
1801     return ret;
1802 }
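/*
 * A minimal usage sketch (the block name, offset and length below are
 * purely illustrative):
 *
 *     if (ram_discard_range("pc.ram", 0, 1024 * 1024)) {
 *         error_report("failed to discard the first MiB of pc.ram");
 *     }
 *
 * i.e. offsets and lengths are in bytes, relative to the start of the
 * named RAMBlock.
 */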
1803 
1804 static int ram_state_init(RAMState **rsp)
1805 {
1806     *rsp = g_new0(RAMState, 1);
1807 
1808     qemu_mutex_init(&(*rsp)->bitmap_mutex);
1809     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1810     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1811 
1812     if (migrate_use_xbzrle()) {
1813         XBZRLE_cache_lock();
1814         XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1815         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1816                                   TARGET_PAGE_SIZE,
1817                                   TARGET_PAGE_SIZE);
1818         if (!XBZRLE.cache) {
1819             XBZRLE_cache_unlock();
1820             error_report("Error creating cache");
1821             g_free(*rsp);
1822             *rsp = NULL;
1823             return -1;
1824         }
1825         XBZRLE_cache_unlock();
1826 
1827         /* We prefer not to abort if there is no memory */
1828         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1829         if (!XBZRLE.encoded_buf) {
1830             error_report("Error allocating encoded_buf");
1831             g_free(*rsp);
1832             *rsp = NULL;
1833             return -1;
1834         }
1835 
1836         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1837         if (!XBZRLE.current_buf) {
1838             error_report("Error allocating current_buf");
1839             g_free(XBZRLE.encoded_buf);
1840             XBZRLE.encoded_buf = NULL;
1841             g_free(*rsp);
1842             *rsp = NULL;
1843             return -1;
1844         }
1845     }
1846 
1847     /* For memory_global_dirty_log_start below.  */
1848     qemu_mutex_lock_iothread();
1849 
1850     qemu_mutex_lock_ramlist();
1851     rcu_read_lock();
1852     ram_state_reset(*rsp);
1853 
1854     /* Skip setting bitmap if there is no RAM */
1855     if (ram_bytes_total()) {
1856         RAMBlock *block;
1857 
1858         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1859             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1860 
1861             block->bmap = bitmap_new(pages);
1862             bitmap_set(block->bmap, 0, pages);
1863             if (migrate_postcopy_ram()) {
1864                 block->unsentmap = bitmap_new(pages);
1865                 bitmap_set(block->unsentmap, 0, pages);
1866             }
1867         }
1868     }
1869 
1870     /*
1871      * Count the total number of pages used by ram blocks not including any
1872      * gaps due to alignment or unplugs.
1873      */
1874     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1875 
1876     memory_global_dirty_log_start();
1877     migration_bitmap_sync(*rsp);
1878     qemu_mutex_unlock_ramlist();
1879     qemu_mutex_unlock_iothread();
1880     rcu_read_unlock();
1881 
1882     return 0;
1883 }
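/*
 * Sizing note for the bitmaps allocated above: with a 4 KiB target page,
 * a hypothetical 512 MiB RAMBlock has 512 MiB >> 12 = 131072 pages, so
 * bmap (and, with postcopy, unsentmap) each cost 131072 bits = 16 KiB per
 * block, and both start out fully set, i.e. "everything is dirty and
 * unsent".
 */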
1884 
1885 /*
1886  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1887  * a long-running RCU critical section.  When RCU reclaims in the code
1888  * start to become numerous it will be necessary to reduce the
1889  * granularity of these critical sections.
1890  */
1891 
1892 /**
1893  * ram_save_setup: Setup RAM for migration
1894  *
1895  * Returns zero to indicate success and negative for error
1896  *
1897  * @f: QEMUFile where to send the data
1898  * @opaque: RAMState pointer
1899  */
1900 static int ram_save_setup(QEMUFile *f, void *opaque)
1901 {
1902     RAMState **rsp = opaque;
1903     RAMBlock *block;
1904 
1905     /* migration has already set up the bitmap; reuse it. */
1906     if (!migration_in_colo_state()) {
1907         if (ram_state_init(rsp) != 0) {
1908             return -1;
1909         }
1910     }
1911     (*rsp)->f = f;
1912 
1913     rcu_read_lock();
1914 
1915     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1916 
1917     RAMBLOCK_FOREACH(block) {
1918         qemu_put_byte(f, strlen(block->idstr));
1919         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1920         qemu_put_be64(f, block->used_length);
1921         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1922             qemu_put_be64(f, block->page_size);
1923         }
1924     }
1925 
1926     rcu_read_unlock();
1927     compress_threads_save_setup();
1928 
1929     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1930     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1931 
1932     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1933 
1934     return 0;
1935 }
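/*
 * A sketch of the setup section that ram_save_setup() emits, matching the
 * qemu_put_* calls above:
 *
 *     be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *     for each RAMBlock:
 *         byte : strlen(idstr)
 *         bytes: idstr (not NUL terminated)
 *         be64 : used_length
 *         be64 : page_size   (only when postcopy is enabled and the block's
 *                             page_size differs from qemu_host_page_size)
 *     be64: RAM_SAVE_FLAG_EOS
 */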
1936 
1937 /**
1938  * ram_save_iterate: iterative stage for migration
1939  *
1940  * Returns zero to indicate success and negative for error
1941  *
1942  * @f: QEMUFile where to send the data
1943  * @opaque: RAMState pointer
1944  */
1945 static int ram_save_iterate(QEMUFile *f, void *opaque)
1946 {
1947     RAMState **temp = opaque;
1948     RAMState *rs = *temp;
1949     int ret;
1950     int i;
1951     int64_t t0;
1952     int done = 0;
1953 
1954     rcu_read_lock();
1955     if (ram_list.version != rs->last_version) {
1956         ram_state_reset(rs);
1957     }
1958 
1959     /* Read version before ram_list.blocks */
1960     smp_rmb();
1961 
1962     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1963 
1964     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1965     i = 0;
1966     while ((ret = qemu_file_rate_limit(f)) == 0) {
1967         int pages;
1968 
1969         pages = ram_find_and_save_block(rs, false);
1970         /* no more pages to send */
1971         if (pages == 0) {
1972             done = 1;
1973             break;
1974         }
1975         rs->iterations++;
1976 
1977         /* We want to check in the 1st loop, just in case it was the 1st
1978            time and we had to sync the dirty bitmap.
1979            qemu_clock_get_ns() is a bit expensive, so we only check once
1980            every 64 iterations.
1981         */
1982         if ((i & 63) == 0) {
1983             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1984             if (t1 > MAX_WAIT) {
1985                 trace_ram_save_iterate_big_wait(t1, i);
1986                 break;
1987             }
1988         }
1989         i++;
1990     }
1991     flush_compressed_data(rs);
1992     rcu_read_unlock();
1993 
1994     /*
1995      * Must occur before EOS (or any QEMUFile operation)
1996      * because of RDMA protocol.
1997      */
1998     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1999 
2000     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2001     ram_counters.transferred += 8;
2002 
2003     ret = qemu_file_get_error(f);
2004     if (ret < 0) {
2005         return ret;
2006     }
2007 
2008     return done;
2009 }
2010 
2011 /**
2012  * ram_save_complete: function called to send the remaining amount of RAM
2013  *
2014  * Returns zero to indicate success
2015  *
2016  * Called with iothread lock
2017  *
2018  * @f: QEMUFile where to send the data
2019  * @opaque: RAMState pointer
2020  */
2021 static int ram_save_complete(QEMUFile *f, void *opaque)
2022 {
2023     RAMState **temp = opaque;
2024     RAMState *rs = *temp;
2025 
2026     rcu_read_lock();
2027 
2028     if (!migration_in_postcopy()) {
2029         migration_bitmap_sync(rs);
2030     }
2031 
2032     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2033 
2034     /* try transferring iterative blocks of memory */
2035 
2036     /* flush all remaining blocks regardless of rate limiting */
2037     while (true) {
2038         int pages;
2039 
2040         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2041         /* no more blocks to send */
2042         if (pages == 0) {
2043             break;
2044         }
2045     }
2046 
2047     flush_compressed_data(rs);
2048     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2049 
2050     rcu_read_unlock();
2051 
2052     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2053 
2054     return 0;
2055 }
2056 
2057 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2058                              uint64_t *non_postcopiable_pending,
2059                              uint64_t *postcopiable_pending)
2060 {
2061     RAMState **temp = opaque;
2062     RAMState *rs = *temp;
2063     uint64_t remaining_size;
2064 
2065     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2066 
2067     if (!migration_in_postcopy() &&
2068         remaining_size < max_size) {
2069         qemu_mutex_lock_iothread();
2070         rcu_read_lock();
2071         migration_bitmap_sync(rs);
2072         rcu_read_unlock();
2073         qemu_mutex_unlock_iothread();
2074         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2075     }
2076 
2077     /* We can do postcopy, and all the data is postcopiable */
2078     *postcopiable_pending += remaining_size;
2079 }
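/*
 * Example of the estimate above, with purely illustrative numbers: 20000
 * dirty pages and a 4 KiB target page give remaining_size = 20000 * 4096
 * bytes, roughly 78 MiB.  If that is already below max_size, the dirty
 * bitmap is synced once more under the iothread lock and the estimate is
 * refreshed before being added to *postcopiable_pending.
 */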
2080 
2081 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2082 {
2083     unsigned int xh_len;
2084     int xh_flags;
2085     uint8_t *loaded_data;
2086 
2087     /* extract RLE header */
2088     xh_flags = qemu_get_byte(f);
2089     xh_len = qemu_get_be16(f);
2090 
2091     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2092         error_report("Failed to load XBZRLE page - wrong compression!");
2093         return -1;
2094     }
2095 
2096     if (xh_len > TARGET_PAGE_SIZE) {
2097         error_report("Failed to load XBZRLE page - len overflow!");
2098         return -1;
2099     }
2100     loaded_data = XBZRLE.decoded_buf;
2101     /* load data and decode */
2102     /* it can change loaded_data to point to an internal buffer */
2103     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2104 
2105     /* decode RLE */
2106     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2107                              TARGET_PAGE_SIZE) == -1) {
2108         error_report("Failed to load XBZRLE page - decode error!");
2109         return -1;
2110     }
2111 
2112     return 0;
2113 }
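/*
 * The XBZRLE page encoding parsed above looks like this on the wire (a
 * sketch of what the sender produces):
 *
 *     byte : ENCODING_FLAG_XBZRLE
 *     be16 : xh_len              (encoded length, at most TARGET_PAGE_SIZE)
 *     bytes: xh_len bytes of XBZRLE delta, decoded against the current
 *            contents of @host
 */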
2114 
2115 /**
2116  * ram_block_from_stream: read a RAMBlock id from the migration stream
2117  *
2118  * Must be called from within a rcu critical section.
2119  *
2120  * Returns a pointer from within the RCU-protected ram_list.
2121  *
2122  * @f: QEMUFile where to read the data from
2123  * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2124  */
2125 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2126 {
2127     static RAMBlock *block = NULL;
2128     char id[256];
2129     uint8_t len;
2130 
2131     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2132         if (!block) {
2133             error_report("Ack, bad migration stream!");
2134             return NULL;
2135         }
2136         return block;
2137     }
2138 
2139     len = qemu_get_byte(f);
2140     qemu_get_buffer(f, (uint8_t *)id, len);
2141     id[len] = 0;
2142 
2143     block = qemu_ram_block_by_name(id);
2144     if (!block) {
2145         error_report("Can't find block %s", id);
2146         return NULL;
2147     }
2148 
2149     return block;
2150 }
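/*
 * On the wire the block id looks like this (a sketch): when
 * RAM_SAVE_FLAG_CONTINUE is clear, the page header is followed by
 *
 *     byte : strlen(idstr)
 *     bytes: idstr (not NUL terminated)
 *
 * With RAM_SAVE_FLAG_CONTINUE set the id is omitted and the block named by
 * the previous page (cached in the static pointer above) is reused.
 */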
2151 
2152 static inline void *host_from_ram_block_offset(RAMBlock *block,
2153                                                ram_addr_t offset)
2154 {
2155     if (!offset_in_ramblock(block, offset)) {
2156         return NULL;
2157     }
2158 
2159     return block->host + offset;
2160 }
2161 
2162 /**
2163  * ram_handle_compressed: handle the zero page case
2164  *
2165  * If a page (or a whole RDMA chunk) has been
2166  * determined to be zero, then zap it.
2167  *
2168  * @host: host address for the zero page
2169  * @ch: the byte the page is filled with; only zero is supported
2170  * @size: size of the zero page
2171  */
2172 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2173 {
2174     if (ch != 0 || !is_zero_range(host, size)) {
2175         memset(host, ch, size);
2176     }
2177 }
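/*
 * For example, the RAM_SAVE_FLAG_ZERO path in ram_load() below does
 * roughly:
 *
 *     ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 *
 * so the memset() is skipped when ch == 0 and the destination page is
 * already zero, which is the common case for freshly allocated memory.
 */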
2178 
2179 static void *do_data_decompress(void *opaque)
2180 {
2181     DecompressParam *param = opaque;
2182     unsigned long pagesize;
2183     uint8_t *des;
2184     int len;
2185 
2186     qemu_mutex_lock(&param->mutex);
2187     while (!param->quit) {
2188         if (param->des) {
2189             des = param->des;
2190             len = param->len;
2191             param->des = 0;
2192             qemu_mutex_unlock(&param->mutex);
2193 
2194             pagesize = TARGET_PAGE_SIZE;
2195             /* uncompress() can fail in some cases, especially when the
2196              * page was dirtied while it was being compressed.  That is
2197              * not a problem because the dirty page will be retransferred
2198              * and uncompress() won't corrupt the data in other pages.
2199              */
2200             uncompress((Bytef *)des, &pagesize,
2201                        (const Bytef *)param->compbuf, len);
2202 
2203             qemu_mutex_lock(&decomp_done_lock);
2204             param->done = true;
2205             qemu_cond_signal(&decomp_done_cond);
2206             qemu_mutex_unlock(&decomp_done_lock);
2207 
2208             qemu_mutex_lock(&param->mutex);
2209         } else {
2210             qemu_cond_wait(&param->cond, &param->mutex);
2211         }
2212     }
2213     qemu_mutex_unlock(&param->mutex);
2214 
2215     return NULL;
2216 }
2217 
2218 static void wait_for_decompress_done(void)
2219 {
2220     int idx, thread_count;
2221 
2222     if (!migrate_use_compression()) {
2223         return;
2224     }
2225 
2226     thread_count = migrate_decompress_threads();
2227     qemu_mutex_lock(&decomp_done_lock);
2228     for (idx = 0; idx < thread_count; idx++) {
2229         while (!decomp_param[idx].done) {
2230             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2231         }
2232     }
2233     qemu_mutex_unlock(&decomp_done_lock);
2234 }
2235 
2236 static void compress_threads_load_setup(void)
2237 {
2238     int i, thread_count;
2239 
2240     if (!migrate_use_compression()) {
2241         return;
2242     }
2243     thread_count = migrate_decompress_threads();
2244     decompress_threads = g_new0(QemuThread, thread_count);
2245     decomp_param = g_new0(DecompressParam, thread_count);
2246     qemu_mutex_init(&decomp_done_lock);
2247     qemu_cond_init(&decomp_done_cond);
2248     for (i = 0; i < thread_count; i++) {
2249         qemu_mutex_init(&decomp_param[i].mutex);
2250         qemu_cond_init(&decomp_param[i].cond);
2251         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2252         decomp_param[i].done = true;
2253         decomp_param[i].quit = false;
2254         qemu_thread_create(decompress_threads + i, "decompress",
2255                            do_data_decompress, decomp_param + i,
2256                            QEMU_THREAD_JOINABLE);
2257     }
2258 }
2259 
2260 static void compress_threads_load_cleanup(void)
2261 {
2262     int i, thread_count;
2263 
2264     if (!migrate_use_compression()) {
2265         return;
2266     }
2267     thread_count = migrate_decompress_threads();
2268     for (i = 0; i < thread_count; i++) {
2269         qemu_mutex_lock(&decomp_param[i].mutex);
2270         decomp_param[i].quit = true;
2271         qemu_cond_signal(&decomp_param[i].cond);
2272         qemu_mutex_unlock(&decomp_param[i].mutex);
2273     }
2274     for (i = 0; i < thread_count; i++) {
2275         qemu_thread_join(decompress_threads + i);
2276         qemu_mutex_destroy(&decomp_param[i].mutex);
2277         qemu_cond_destroy(&decomp_param[i].cond);
2278         g_free(decomp_param[i].compbuf);
2279     }
2280     g_free(decompress_threads);
2281     g_free(decomp_param);
2282     decompress_threads = NULL;
2283     decomp_param = NULL;
2284 }
2285 
2286 static void decompress_data_with_multi_threads(QEMUFile *f,
2287                                                void *host, int len)
2288 {
2289     int idx, thread_count;
2290 
2291     thread_count = migrate_decompress_threads();
2292     qemu_mutex_lock(&decomp_done_lock);
2293     while (true) {
2294         for (idx = 0; idx < thread_count; idx++) {
2295             if (decomp_param[idx].done) {
2296                 decomp_param[idx].done = false;
2297                 qemu_mutex_lock(&decomp_param[idx].mutex);
2298                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2299                 decomp_param[idx].des = host;
2300                 decomp_param[idx].len = len;
2301                 qemu_cond_signal(&decomp_param[idx].cond);
2302                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2303                 break;
2304             }
2305         }
2306         if (idx < thread_count) {
2307             break;
2308         } else {
2309             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2310         }
2311     }
2312     qemu_mutex_unlock(&decomp_done_lock);
2313 }
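/*
 * Hand-off between the dispatcher above and do_data_decompress(), as a
 * sketch:
 *
 *  1. Holding decomp_done_lock, the dispatcher finds a worker whose
 *     param->done is true and clears it (or waits on decomp_done_cond).
 *  2. Under param->mutex it reads the compressed bytes into param->compbuf,
 *     fills in param->des and param->len, and signals param->cond.
 *  3. The worker wakes, copies des/len, drops param->mutex while it runs
 *     uncompress(), then sets param->done under decomp_done_lock and
 *     signals decomp_done_cond so the slot can be reused.
 */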
2314 
2315 /**
2316  * ram_load_setup: Setup RAM for migration incoming side
2317  *
2318  * Returns zero to indicate success and negative for error
2319  *
2320  * @f: QEMUFile where to receive the data
2321  * @opaque: RAMState pointer
2322  */
2323 static int ram_load_setup(QEMUFile *f, void *opaque)
2324 {
2325     xbzrle_load_setup();
2326     compress_threads_load_setup();
2327     return 0;
2328 }
2329 
2330 static int ram_load_cleanup(void *opaque)
2331 {
2332     xbzrle_load_cleanup();
2333     compress_threads_load_cleanup();
2334     return 0;
2335 }
2336 
2337 /**
2338  * ram_postcopy_incoming_init: allocate postcopy data structures
2339  *
2340  * Returns 0 for success and negative if there was one error
2341  *
2342  * @mis: current migration incoming state
2343  *
2344  * Allocate data structures etc needed by incoming migration with
2345  * postcopy-ram. postcopy-ram's similarly named
2346  * postcopy_ram_incoming_init does the work.
2347  */
2348 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2349 {
2350     unsigned long ram_pages = last_ram_page();
2351 
2352     return postcopy_ram_incoming_init(mis, ram_pages);
2353 }
2354 
2355 /**
2356  * ram_load_postcopy: load a page in postcopy case
2357  *
2358  * Returns 0 for success or -errno in case of error
2359  *
2360  * Called in postcopy mode by ram_load().
2361  * rcu_read_lock is taken prior to this being called.
2362  *
2363  * @f: QEMUFile where to send the data
2364  * @f: QEMUFile where to receive the data
2365 static int ram_load_postcopy(QEMUFile *f)
2366 {
2367     int flags = 0, ret = 0;
2368     bool place_needed = false;
2369     bool matching_page_sizes = false;
2370     MigrationIncomingState *mis = migration_incoming_get_current();
2371     /* Temporary page that is later 'placed' */
2372     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2373     void *last_host = NULL;
2374     bool all_zero = false;
2375 
2376     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2377         ram_addr_t addr;
2378         void *host = NULL;
2379         void *page_buffer = NULL;
2380         void *place_source = NULL;
2381         RAMBlock *block = NULL;
2382         uint8_t ch;
2383 
2384         addr = qemu_get_be64(f);
2385         flags = addr & ~TARGET_PAGE_MASK;
2386         addr &= TARGET_PAGE_MASK;
2387 
2388         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2389         place_needed = false;
2390         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2391             block = ram_block_from_stream(f, flags);
2392 
2393             host = host_from_ram_block_offset(block, addr);
2394             if (!host) {
2395                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2396                 ret = -EINVAL;
2397                 break;
2398             }
2399             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2400             /*
2401              * Postcopy requires that we place whole host pages atomically;
2402              * these may be huge pages for RAMBlocks that are backed by
2403              * hugetlbfs.
2404              * To make it atomic, the data is read into a temporary page
2405              * that's moved into place later.
2406              * The migration protocol uses (possibly smaller) target pages,
2407              * but the source ensures that it always sends all the
2408              * components of a host page in order.
2409              */
2410             page_buffer = postcopy_host_page +
2411                           ((uintptr_t)host & (block->page_size - 1));
2412             /* If all TPs are zero then we can optimise the placement */
2413             if (!((uintptr_t)host & (block->page_size - 1))) {
2414                 all_zero = true;
2415             } else {
2416                 /* not the 1st TP within the HP */
2417                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2418                     error_report("Non-sequential target page %p/%p",
2419                                   host, last_host);
2420                     ret = -EINVAL;
2421                     break;
2422                 }
2423             }
2424 
2425 
2426             /*
2427              * If it's the last part of a host page then we place the host
2428              * page
2429              */
2430             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2431                                      (block->page_size - 1)) == 0;
2432             place_source = postcopy_host_page;
2433         }
2434         last_host = host;
2435 
2436         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2437         case RAM_SAVE_FLAG_ZERO:
2438             ch = qemu_get_byte(f);
2439             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2440             if (ch) {
2441                 all_zero = false;
2442             }
2443             break;
2444 
2445         case RAM_SAVE_FLAG_PAGE:
2446             all_zero = false;
2447             if (!place_needed || !matching_page_sizes) {
2448                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2449             } else {
2450                 /* Avoid the qemu_file copy: postcopy is going to do a
2451                  * copy into place later anyway.  We can only do this
2452                  * when the read is done in one go (matching page sizes).
2453                  */
2454                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2455                                          TARGET_PAGE_SIZE);
2456             }
2457             break;
2458         case RAM_SAVE_FLAG_EOS:
2459             /* normal exit */
2460             break;
2461         default:
2462             error_report("Unknown combination of migration flags: %#x"
2463                          " (postcopy mode)", flags);
2464             ret = -EINVAL;
2465         }
2466 
2467         if (place_needed) {
2468             /* This gets called at the last target page in the host page */
2469             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2470 
2471             if (all_zero) {
2472                 ret = postcopy_place_page_zero(mis, place_dest,
2473                                                block->page_size);
2474             } else {
2475                 ret = postcopy_place_page(mis, place_dest,
2476                                           place_source, block->page_size);
2477             }
2478         }
2479         if (!ret) {
2480             ret = qemu_file_get_error(f);
2481         }
2482     }
2483 
2484     return ret;
2485 }
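/*
 * A worked example of the host-page assembly above, assuming a hugetlbfs
 * backed block with a 2 MiB host page and a 4 KiB target page (512 target
 * pages per host page): each incoming target page is copied to
 *
 *     postcopy_host_page + (host & (block->page_size - 1))
 *
 * and nothing is placed until the 512th (last) target page of that host
 * page arrives.  At that point place_dest works out to
 *
 *     host + TARGET_PAGE_SIZE - block->page_size
 *
 * i.e. the start of the host page, and the whole 2 MiB is placed
 * atomically, via postcopy_place_page_zero() if every target page was
 * zero or postcopy_place_page() otherwise.
 */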
2486 
2487 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2488 {
2489     int flags = 0, ret = 0, invalid_flags = 0;
2490     static uint64_t seq_iter;
2491     int len = 0;
2492     /*
2493      * If the system is running in postcopy mode, page inserts to host memory must
2494      * be atomic
2495      */
2496     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2497     /* ADVISE comes earlier; it shows the source has the postcopy capability on */
2498     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2499 
2500     seq_iter++;
2501 
2502     if (version_id != 4) {
2503         ret = -EINVAL;
2504     }
2505 
2506     if (!migrate_use_compression()) {
2507         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2508     }
2509     /* This RCU critical section can be very long running.
2510      * When RCU reclaims in the code start to become numerous,
2511      * it will be necessary to reduce the granularity of this
2512      * critical section.
2513      */
2514     rcu_read_lock();
2515 
2516     if (postcopy_running) {
2517         ret = ram_load_postcopy(f);
2518     }
2519 
2520     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2521         ram_addr_t addr, total_ram_bytes;
2522         void *host = NULL;
2523         uint8_t ch;
2524 
2525         addr = qemu_get_be64(f);
2526         flags = addr & ~TARGET_PAGE_MASK;
2527         addr &= TARGET_PAGE_MASK;
2528 
2529         if (flags & invalid_flags) {
2530             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2531                 error_report("Received an unexpected compressed page");
2532             }
2533 
2534             ret = -EINVAL;
2535             break;
2536         }
2537 
2538         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2539                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2540             RAMBlock *block = ram_block_from_stream(f, flags);
2541 
2542             host = host_from_ram_block_offset(block, addr);
2543             if (!host) {
2544                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2545                 ret = -EINVAL;
2546                 break;
2547             }
2548             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2549         }
2550 
2551         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2552         case RAM_SAVE_FLAG_MEM_SIZE:
2553             /* Synchronize RAM block list */
2554             total_ram_bytes = addr;
2555             while (!ret && total_ram_bytes) {
2556                 RAMBlock *block;
2557                 char id[256];
2558                 ram_addr_t length;
2559 
2560                 len = qemu_get_byte(f);
2561                 qemu_get_buffer(f, (uint8_t *)id, len);
2562                 id[len] = 0;
2563                 length = qemu_get_be64(f);
2564 
2565                 block = qemu_ram_block_by_name(id);
2566                 if (block) {
2567                     if (length != block->used_length) {
2568                         Error *local_err = NULL;
2569 
2570                         ret = qemu_ram_resize(block, length,
2571                                               &local_err);
2572                         if (local_err) {
2573                             error_report_err(local_err);
2574                         }
2575                     }
2576                     /* For postcopy we need to check hugepage sizes match */
2577                     if (postcopy_advised &&
2578                         block->page_size != qemu_host_page_size) {
2579                         uint64_t remote_page_size = qemu_get_be64(f);
2580                         if (remote_page_size != block->page_size) {
2581                             error_report("Mismatched RAM page size %s "
2582                                          "(local) %zd != %" PRId64,
2583                                          id, block->page_size,
2584                                          remote_page_size);
2585                             ret = -EINVAL;
2586                         }
2587                     }
2588                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2589                                           block->idstr);
2590                 } else {
2591                     error_report("Unknown ramblock \"%s\", cannot "
2592                                  "accept migration", id);
2593                     ret = -EINVAL;
2594                 }
2595 
2596                 total_ram_bytes -= length;
2597             }
2598             break;
2599 
2600         case RAM_SAVE_FLAG_ZERO:
2601             ch = qemu_get_byte(f);
2602             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2603             break;
2604 
2605         case RAM_SAVE_FLAG_PAGE:
2606             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2607             break;
2608 
2609         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2610             len = qemu_get_be32(f);
2611             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2612                 error_report("Invalid compressed data length: %d", len);
2613                 ret = -EINVAL;
2614                 break;
2615             }
2616             decompress_data_with_multi_threads(f, host, len);
2617             break;
2618 
2619         case RAM_SAVE_FLAG_XBZRLE:
2620             if (load_xbzrle(f, addr, host) < 0) {
2621                 error_report("Failed to decompress XBZRLE page at "
2622                              RAM_ADDR_FMT, addr);
2623                 ret = -EINVAL;
2624                 break;
2625             }
2626             break;
2627         case RAM_SAVE_FLAG_EOS:
2628             /* normal exit */
2629             break;
2630         default:
2631             if (flags & RAM_SAVE_FLAG_HOOK) {
2632                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2633             } else {
2634                 error_report("Unknown combination of migration flags: %#x",
2635                              flags);
2636                 ret = -EINVAL;
2637             }
2638         }
2639         if (!ret) {
2640             ret = qemu_file_get_error(f);
2641         }
2642     }
2643 
2644     wait_for_decompress_done();
2645     rcu_read_unlock();
2646     trace_ram_load_complete(ret, seq_iter);
2647     return ret;
2648 }
2649 
2650 static SaveVMHandlers savevm_ram_handlers = {
2651     .save_setup = ram_save_setup,
2652     .save_live_iterate = ram_save_iterate,
2653     .save_live_complete_postcopy = ram_save_complete,
2654     .save_live_complete_precopy = ram_save_complete,
2655     .save_live_pending = ram_save_pending,
2656     .load_state = ram_load,
2657     .save_cleanup = ram_save_cleanup,
2658     .load_setup = ram_load_setup,
2659     .load_cleanup = ram_load_cleanup,
2660 };
2661 
2662 void ram_mig_init(void)
2663 {
2664     qemu_mutex_init(&XBZRLE.lock);
2665     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2666 }
2667