xref: /openbmc/qemu/migration/ram.c (revision ac12b601)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58 
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62 
63 /***********************************************************/
64 /* ram save/restore */
65 
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char.  We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70  */
71 
72 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO     0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE     0x08
76 #define RAM_SAVE_FLAG_EOS      0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE   0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 next */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
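
/*
 * Editor's illustrative sketch (not part of upstream ram.c): the flags above
 * are OR-ed into the low bits of the page offset before the offset is written
 * as the 8-byte page header, and the receiver splits them apart again.
 * Assuming a QEMUFile *f and a page-aligned ram_addr_t offset:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO);   (sender)
 *
 *     uint64_t addr = qemu_get_be64(f);                (receiver)
 *     int flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;
 */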
81 
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84     return buffer_is_zero(p, size);
85 }
86 
87 XBZRLECacheStats xbzrle_counters;
88 
89 /* This struct contains the XBZRLE cache and a static page
90    used by the compression */
91 static struct {
92     /* buffer used for XBZRLE encoding */
93     uint8_t *encoded_buf;
94     /* buffer for storing page content */
95     uint8_t *current_buf;
96     /* Cache for XBZRLE, Protected by lock. */
97     PageCache *cache;
98     QemuMutex lock;
99     /* it will store a page full of zeros */
100     uint8_t *zero_target_page;
101     /* buffer used for XBZRLE decoding */
102     uint8_t *decoded_buf;
103 } XBZRLE;
104 
105 static void XBZRLE_cache_lock(void)
106 {
107     if (migrate_use_xbzrle()) {
108         qemu_mutex_lock(&XBZRLE.lock);
109     }
110 }
111 
112 static void XBZRLE_cache_unlock(void)
113 {
114     if (migrate_use_xbzrle()) {
115         qemu_mutex_unlock(&XBZRLE.lock);
116     }
117 }
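
/*
 * Editor's illustrative usage sketch: any access to XBZRLE.cache is expected
 * to be bracketed by the helpers above, e.g.
 *
 *     XBZRLE_cache_lock();
 *     cache_insert(XBZRLE.cache, addr, data, ram_counters.dirty_sync_count);
 *     XBZRLE_cache_unlock();
 */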
118 
119 /**
120  * xbzrle_cache_resize: resize the xbzrle cache
121  *
122  * This function is called from migrate_params_apply in the main
123  * thread, possibly while a migration is in progress.  A running
124  * migration may be using the cache and might finish during this call,
125  * hence changes to the cache are protected by the XBZRLE.lock mutex.
126  *
127  * Returns 0 for success or -1 for error
128  *
129  * @new_size: new cache size
130  * @errp: set *errp if the check fails, with the reason
131  */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134     PageCache *new_cache;
135     int64_t ret = 0;
136 
137     /* Check for truncation */
138     if (new_size != (size_t)new_size) {
139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140                    "exceeding address space");
141         return -1;
142     }
143 
144     if (new_size == migrate_xbzrle_cache_size()) {
145         /* nothing to do */
146         return 0;
147     }
148 
149     XBZRLE_cache_lock();
150 
151     if (XBZRLE.cache != NULL) {
152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153         if (!new_cache) {
154             ret = -1;
155             goto out;
156         }
157 
158         cache_fini(XBZRLE.cache);
159         XBZRLE.cache = new_cache;
160     }
161 out:
162     XBZRLE_cache_unlock();
163     return ret;
164 }
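
/*
 * Editor's illustrative usage sketch (hypothetical caller and cache size):
 * how parameter-setting code might drive the resize, propagating the error.
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(64 * 1024 * 1024, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */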
165 
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168     return !qemu_ram_is_migratable(block) ||
169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171 
172 #undef RAMBLOCK_FOREACH
173 
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176     RAMBlock *block;
177     int ret = 0;
178 
179     RCU_READ_LOCK_GUARD();
180 
181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182         ret = func(block, opaque);
183         if (ret) {
184             break;
185         }
186     }
187     return ret;
188 }
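
/*
 * Editor's illustrative usage sketch: a hypothetical callback that sums the
 * used length of every non-ignored block; returning non-zero from the
 * callback stops the iteration early.
 *
 *     static int example_sum_used_length(RAMBlock *rb, void *opaque)
 *     {
 *         *(uint64_t *)opaque += rb->used_length;
 *         return 0;
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(example_sum_used_length, &total);
 */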
189 
190 static void ramblock_recv_map_init(void)
191 {
192     RAMBlock *rb;
193 
194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195         assert(!rb->receivedmap);
196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197     }
198 }
199 
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203                     rb->receivedmap);
204 }
205 
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210 
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217                                     size_t nr)
218 {
219     bitmap_set_atomic(rb->receivedmap,
220                       ramblock_recv_bitmap_offset(host_addr, rb),
221                       nr);
222 }
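
/*
 * Editor's note: ramblock_recv_bitmap_offset() appears to translate a host
 * pointer into a page index within the block, roughly
 *
 *     (host_addr - rb->host) >> TARGET_PAGE_BITS
 *
 * so the helpers above all operate on per-target-page bits of receivedmap.
 */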
223 
224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
225 
226 /*
227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228  *
229  * Returns >0 if success with sent bytes, or <0 if error.
230  */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232                                   const char *block_name)
233 {
234     RAMBlock *block = qemu_ram_block_by_name(block_name);
235     unsigned long *le_bitmap, nbits;
236     uint64_t size;
237 
238     if (!block) {
239         error_report("%s: invalid block name: %s", __func__, block_name);
240         return -1;
241     }
242 
243     nbits = block->used_length >> TARGET_PAGE_BITS;
244 
245     /*
246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247      * machines we may need 4 more bytes for padding (see below
248      * comment). So extend it a bit beforehand.
249      */
250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251 
252     /*
253      * Always use little endian when sending the bitmap. This is
254      * required when the source and destination VMs are not using the
255      * same endianness. (Note: big endian won't work.)
256      */
257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258 
259     /* Size of the bitmap, in bytes */
260     size = DIV_ROUND_UP(nbits, 8);
261 
262     /*
263      * size is always aligned to 8 bytes for 64bit machines, but it
264      * may not be true for 32bit machines. We need this padding to
265      * make sure the migration can survive even between 32bit and
266      * 64bit machines.
267      */
268     size = ROUND_UP(size, 8);
269 
270     qemu_put_be64(file, size);
271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272     /*
273      * Mark as an end, in case the middle part is screwed up due to
274      * some "mysterious" reason.
275      */
276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277     qemu_fflush(file);
278 
279     g_free(le_bitmap);
280 
281     if (qemu_file_get_error(file)) {
282         return qemu_file_get_error(file);
283     }
284 
285     return size + sizeof(size);
286 }
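
/*
 * Editor's illustrative sketch of how the peer side is expected to consume
 * the stream produced above: an 8-byte size, then size bytes of little-endian
 * bitmap, then the ending marker as a sanity check.
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         error_report("receivedmap bitmap stream corrupted");
 *     }
 */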
287 
288 /*
289  * An outstanding page request, on the source, having been received
290  * and queued
291  */
292 struct RAMSrcPageRequest {
293     RAMBlock *rb;
294     hwaddr    offset;
295     hwaddr    len;
296 
297     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299 
300 /* State of RAM for migration */
301 struct RAMState {
302     /* QEMUFile used for this migration */
303     QEMUFile *f;
304     /* UFFD file descriptor, used in 'write-tracking' migration */
305     int uffdio_fd;
306     /* Last block that we have visited searching for dirty pages */
307     RAMBlock *last_seen_block;
308     /* Last block from where we have sent data */
309     RAMBlock *last_sent_block;
310     /* Last dirty target page we have sent */
311     ram_addr_t last_page;
312     /* last ram version we have seen */
313     uint32_t last_version;
314     /* We are in the first round */
315     bool ram_bulk_stage;
316     /* The free page optimization is enabled */
317     bool fpo_enabled;
318     /* How many times we have dirty too many pages */
319     int dirty_rate_high_cnt;
320     /* these variables are used for bitmap sync */
321     /* last time we did a full bitmap_sync */
322     int64_t time_last_bitmap_sync;
323     /* bytes transferred at start_time */
324     uint64_t bytes_xfer_prev;
325     /* number of dirty pages since start_time */
326     uint64_t num_dirty_pages_period;
327     /* xbzrle misses since the beginning of the period */
328     uint64_t xbzrle_cache_miss_prev;
329     /* Amount of xbzrle pages since the beginning of the period */
330     uint64_t xbzrle_pages_prev;
331     /* Amount of xbzrle encoded bytes since the beginning of the period */
332     uint64_t xbzrle_bytes_prev;
333 
334     /* compression statistics since the beginning of the period */
335     /* number of times there was no free thread to compress data */
336     uint64_t compress_thread_busy_prev;
337     /* amount of bytes after compression */
338     uint64_t compressed_size_prev;
339     /* amount of compressed pages */
340     uint64_t compress_pages_prev;
341 
342     /* total handled target pages at the beginning of period */
343     uint64_t target_page_count_prev;
344     /* total handled target pages since start */
345     uint64_t target_page_count;
346     /* number of dirty bits in the bitmap */
347     uint64_t migration_dirty_pages;
348     /* Protects modification of the bitmap and migration dirty pages */
349     QemuMutex bitmap_mutex;
350     /* The RAMBlock used in the last src_page_requests */
351     RAMBlock *last_req_rb;
352     /* Queue of outstanding page requests from the destination */
353     QemuMutex src_page_req_mutex;
354     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
355 };
356 typedef struct RAMState RAMState;
357 
358 static RAMState *ram_state;
359 
360 static NotifierWithReturnList precopy_notifier_list;
361 
362 void precopy_infrastructure_init(void)
363 {
364     notifier_with_return_list_init(&precopy_notifier_list);
365 }
366 
367 void precopy_add_notifier(NotifierWithReturn *n)
368 {
369     notifier_with_return_list_add(&precopy_notifier_list, n);
370 }
371 
372 void precopy_remove_notifier(NotifierWithReturn *n)
373 {
374     notifier_with_return_remove(n);
375 }
376 
377 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
378 {
379     PrecopyNotifyData pnd;
380     pnd.reason = reason;
381     pnd.errp = errp;
382 
383     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 }
385 
386 void precopy_enable_free_page_optimization(void)
387 {
388     if (!ram_state) {
389         return;
390     }
391 
392     ram_state->fpo_enabled = true;
393 }
394 
395 uint64_t ram_bytes_remaining(void)
396 {
397     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
398                        0;
399 }
400 
401 MigrationStats ram_counters;
402 
403 /* used by the search for pages to send */
404 struct PageSearchStatus {
405     /* Current block being searched */
406     RAMBlock    *block;
407     /* Current page to search from */
408     unsigned long page;
409     /* Set once we wrap around */
410     bool         complete_round;
411 };
412 typedef struct PageSearchStatus PageSearchStatus;
413 
414 CompressionStats compression_counters;
415 
416 struct CompressParam {
417     bool done;
418     bool quit;
419     bool zero_page;
420     QEMUFile *file;
421     QemuMutex mutex;
422     QemuCond cond;
423     RAMBlock *block;
424     ram_addr_t offset;
425 
426     /* internally used fields */
427     z_stream stream;
428     uint8_t *originbuf;
429 };
430 typedef struct CompressParam CompressParam;
431 
432 struct DecompressParam {
433     bool done;
434     bool quit;
435     QemuMutex mutex;
436     QemuCond cond;
437     void *des;
438     uint8_t *compbuf;
439     int len;
440     z_stream stream;
441 };
442 typedef struct DecompressParam DecompressParam;
443 
444 static CompressParam *comp_param;
445 static QemuThread *compress_threads;
446 /* comp_done_cond is used to wake up the migration thread when
447  * one of the compression threads has finished the compression.
448  * comp_done_lock is used together with comp_done_cond.
449  */
450 static QemuMutex comp_done_lock;
451 static QemuCond comp_done_cond;
452 /* The empty QEMUFileOps will be used by the file in CompressParam */
453 static const QEMUFileOps empty_ops = { };
454 
455 static QEMUFile *decomp_file;
456 static DecompressParam *decomp_param;
457 static QemuThread *decompress_threads;
458 static QemuMutex decomp_done_lock;
459 static QemuCond decomp_done_cond;
460 
461 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
462                                  ram_addr_t offset, uint8_t *source_buf);
463 
464 static void *do_data_compress(void *opaque)
465 {
466     CompressParam *param = opaque;
467     RAMBlock *block;
468     ram_addr_t offset;
469     bool zero_page;
470 
471     qemu_mutex_lock(&param->mutex);
472     while (!param->quit) {
473         if (param->block) {
474             block = param->block;
475             offset = param->offset;
476             param->block = NULL;
477             qemu_mutex_unlock(&param->mutex);
478 
479             zero_page = do_compress_ram_page(param->file, &param->stream,
480                                              block, offset, param->originbuf);
481 
482             qemu_mutex_lock(&comp_done_lock);
483             param->done = true;
484             param->zero_page = zero_page;
485             qemu_cond_signal(&comp_done_cond);
486             qemu_mutex_unlock(&comp_done_lock);
487 
488             qemu_mutex_lock(&param->mutex);
489         } else {
490             qemu_cond_wait(&param->cond, &param->mutex);
491         }
492     }
493     qemu_mutex_unlock(&param->mutex);
494 
495     return NULL;
496 }
497 
498 static void compress_threads_save_cleanup(void)
499 {
500     int i, thread_count;
501 
502     if (!migrate_use_compression() || !comp_param) {
503         return;
504     }
505 
506     thread_count = migrate_compress_threads();
507     for (i = 0; i < thread_count; i++) {
508         /*
509          * we use it as an indicator which shows whether the thread is
510          * properly initialized or not
511          */
512         if (!comp_param[i].file) {
513             break;
514         }
515 
516         qemu_mutex_lock(&comp_param[i].mutex);
517         comp_param[i].quit = true;
518         qemu_cond_signal(&comp_param[i].cond);
519         qemu_mutex_unlock(&comp_param[i].mutex);
520 
521         qemu_thread_join(compress_threads + i);
522         qemu_mutex_destroy(&comp_param[i].mutex);
523         qemu_cond_destroy(&comp_param[i].cond);
524         deflateEnd(&comp_param[i].stream);
525         g_free(comp_param[i].originbuf);
526         qemu_fclose(comp_param[i].file);
527         comp_param[i].file = NULL;
528     }
529     qemu_mutex_destroy(&comp_done_lock);
530     qemu_cond_destroy(&comp_done_cond);
531     g_free(compress_threads);
532     g_free(comp_param);
533     compress_threads = NULL;
534     comp_param = NULL;
535 }
536 
537 static int compress_threads_save_setup(void)
538 {
539     int i, thread_count;
540 
541     if (!migrate_use_compression()) {
542         return 0;
543     }
544     thread_count = migrate_compress_threads();
545     compress_threads = g_new0(QemuThread, thread_count);
546     comp_param = g_new0(CompressParam, thread_count);
547     qemu_cond_init(&comp_done_cond);
548     qemu_mutex_init(&comp_done_lock);
549     for (i = 0; i < thread_count; i++) {
550         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
551         if (!comp_param[i].originbuf) {
552             goto exit;
553         }
554 
555         if (deflateInit(&comp_param[i].stream,
556                         migrate_compress_level()) != Z_OK) {
557             g_free(comp_param[i].originbuf);
558             goto exit;
559         }
560 
561         /* comp_param[i].file is just used as a dummy buffer to save data,
562          * set its ops to empty.
563          */
564         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
565         comp_param[i].done = true;
566         comp_param[i].quit = false;
567         qemu_mutex_init(&comp_param[i].mutex);
568         qemu_cond_init(&comp_param[i].cond);
569         qemu_thread_create(compress_threads + i, "compress",
570                            do_data_compress, comp_param + i,
571                            QEMU_THREAD_JOINABLE);
572     }
573     return 0;
574 
575 exit:
576     compress_threads_save_cleanup();
577     return -1;
578 }
579 
580 /**
581  * save_page_header: write page header to wire
582  *
583  * If this is the 1st block, it also writes the block identification
584  *
585  * Returns the number of bytes written
586  *
587  * @f: QEMUFile where to send the data
588  * @block: block that contains the page we want to send
589  * @offset: offset inside the block for the page
590  *          in the lower bits, it contains flags
591  */
592 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
593                                ram_addr_t offset)
594 {
595     size_t size, len;
596 
597     if (block == rs->last_sent_block) {
598         offset |= RAM_SAVE_FLAG_CONTINUE;
599     }
600     qemu_put_be64(f, offset);
601     size = 8;
602 
603     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
604         len = strlen(block->idstr);
605         qemu_put_byte(f, len);
606         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
607         size += 1 + len;
608         rs->last_sent_block = block;
609     }
610     return size;
611 }
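
/*
 * Editor's note: the resulting wire layout for a header without
 * RAM_SAVE_FLAG_CONTINUE is
 *
 *     8 bytes   offset | flags        (big endian)
 *     1 byte    strlen(block->idstr)
 *     N bytes   block->idstr          (not NUL terminated)
 *
 * and with RAM_SAVE_FLAG_CONTINUE set only the 8-byte word is emitted.
 */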
612 
613 /**
614  * mig_throttle_guest_down: throttle down the guest
615  *
616  * Reduce the amount of guest CPU execution to hopefully slow down memory
617  * writes. If guest dirty memory rate is reduced below the rate at
618  * which we can transfer pages to the destination then we should be
619  * able to complete migration. Some workloads dirty memory way too
620  * fast and will not effectively converge, even with auto-converge.
621  */
622 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
623                                     uint64_t bytes_dirty_threshold)
624 {
625     MigrationState *s = migrate_get_current();
626     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
627     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
628     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
629     int pct_max = s->parameters.max_cpu_throttle;
630 
631     uint64_t throttle_now = cpu_throttle_get_percentage();
632     uint64_t cpu_now, cpu_ideal, throttle_inc;
633 
634     /* We have not started throttling yet. Let's start it. */
635     if (!cpu_throttle_active()) {
636         cpu_throttle_set(pct_initial);
637     } else {
638         /* Throttling already on, just increase the rate */
639         if (!pct_tailslow) {
640             throttle_inc = pct_increment;
641         } else {
642             /* Compute the ideal CPU percentage used by the guest, which may
643              * make the dirty rate match the dirty rate threshold. */
644             cpu_now = 100 - throttle_now;
645             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
646                         bytes_dirty_period);
647             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
648         }
649         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
650     }
651 }
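
/*
 * Editor's worked example with hypothetical numbers: with the throttle
 * currently at 40%, cpu_now is 60.  If the guest dirties twice as many bytes
 * as can be transferred (bytes_dirty_threshold / bytes_dirty_period == 0.5),
 * then cpu_ideal = 60 * 0.5 = 30, so the tailslow path raises the throttle by
 * MIN(60 - 30, pct_increment) instead of a fixed pct_increment step.
 */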
652 
653 /**
654  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
655  *
656  * @rs: current RAM state
657  * @current_addr: address for the zero page
658  *
659  * Update the xbzrle cache to reflect a page that's been sent as all 0.
660  * The important thing is that a stale (not-yet-0'd) page be replaced
661  * by the new data.
662  * As a bonus, if the page wasn't in the cache it gets added so that
663  * when a small write is made into the 0'd page it gets XBZRLE sent.
664  */
665 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
666 {
667     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
668         return;
669     }
670 
671     /* We don't care if this fails to allocate a new cache page
672      * as long as it updates an old one */
673     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
674                  ram_counters.dirty_sync_count);
675 }
676 
677 #define ENCODING_FLAG_XBZRLE 0x1
678 
679 /**
680  * save_xbzrle_page: compress and send current page
681  *
682  * Returns: 1 means that we wrote the page
683  *          0 means that page is identical to the one already sent
684  *          -1 means that xbzrle would be longer than normal
685  *
686  * @rs: current RAM state
687  * @current_data: pointer to the address of the page contents
688  * @current_addr: addr of the page
689  * @block: block that contains the page we want to send
690  * @offset: offset inside the block for the page
691  * @last_stage: if we are at the completion stage
692  */
693 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
694                             ram_addr_t current_addr, RAMBlock *block,
695                             ram_addr_t offset, bool last_stage)
696 {
697     int encoded_len = 0, bytes_xbzrle;
698     uint8_t *prev_cached_page;
699 
700     if (!cache_is_cached(XBZRLE.cache, current_addr,
701                          ram_counters.dirty_sync_count)) {
702         xbzrle_counters.cache_miss++;
703         if (!last_stage) {
704             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
705                              ram_counters.dirty_sync_count) == -1) {
706                 return -1;
707             } else {
708                 /* update *current_data when the page has been
709                    inserted into cache */
710                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
711             }
712         }
713         return -1;
714     }
715 
716     /*
717      * Reaching here means the page has hit the xbzrle cache, no matter what
718      * encoding result it is (normal encoding, overflow or skipping the page),
719      * count the page as encoded. This is used to calculate the encoding rate.
720      *
721      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
722      * 2nd page turns out to be skipped (i.e. no new bytes written to the
723      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
724      * skipped page included. In this way, the encoding rate can tell if the
725      * guest page is good for xbzrle encoding.
726      */
727     xbzrle_counters.pages++;
728     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
729 
730     /* save current buffer into memory */
731     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
732 
733     /* XBZRLE encoding (if there is no overflow) */
734     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
735                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
736                                        TARGET_PAGE_SIZE);
737 
738     /*
739      * Update the cache contents, so that it corresponds to the data
740      * sent, in all cases except where we skip the page.
741      */
742     if (!last_stage && encoded_len != 0) {
743         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
744         /*
745          * In the case where we couldn't compress, ensure that the caller
746          * sends the data from the cache, since the guest might have
747          * changed the RAM since we copied it.
748          */
749         *current_data = prev_cached_page;
750     }
751 
752     if (encoded_len == 0) {
753         trace_save_xbzrle_page_skipping();
754         return 0;
755     } else if (encoded_len == -1) {
756         trace_save_xbzrle_page_overflow();
757         xbzrle_counters.overflow++;
758         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
759         return -1;
760     }
761 
762     /* Send XBZRLE based compressed page */
763     bytes_xbzrle = save_page_header(rs, rs->f, block,
764                                     offset | RAM_SAVE_FLAG_XBZRLE);
765     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
766     qemu_put_be16(rs->f, encoded_len);
767     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
768     bytes_xbzrle += encoded_len + 1 + 2;
769     /*
770      * Like compressed_size (please see update_compress_thread_counts),
771      * the xbzrle encoded bytes don't count the 8 byte header with
772      * RAM_SAVE_FLAG_CONTINUE.
773      */
774     xbzrle_counters.bytes += bytes_xbzrle - 8;
775     ram_counters.transferred += bytes_xbzrle;
776 
777     return 1;
778 }
779 
780 /**
781  * migration_bitmap_find_dirty: find the next dirty page from start
782  *
783  * Returns the page offset within memory region of the start of a dirty page
784  *
785  * @rs: current RAM state
786  * @rb: RAMBlock where to search for dirty pages
787  * @start: page where we start the search
788  */
789 static inline
790 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
791                                           unsigned long start)
792 {
793     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
794     unsigned long *bitmap = rb->bmap;
795     unsigned long next;
796 
797     if (ramblock_is_ignored(rb)) {
798         return size;
799     }
800 
801     /*
802      * When the free page optimization is enabled, we need to check the bitmap
803      * to send the non-free pages rather than all the pages in the bulk stage.
804      */
805     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
806         next = start + 1;
807     } else {
808         next = find_next_bit(bitmap, size, start);
809     }
810 
811     return next;
812 }
813 
814 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
815                                                 RAMBlock *rb,
816                                                 unsigned long page)
817 {
818     bool ret;
819 
820     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
821 
822     /*
823      * Clear dirty bitmap if needed.  This _must_ be called before we
824      * send any of the page in the chunk because we need to make sure
825      * we can capture further page content changes when we sync dirty
826      * log the next time.  So as long as we are going to send any of
827      * the page in the chunk we clear the remote dirty bitmap for all.
828      * Clearing it earlier won't be a problem, but too late will.
829      */
830     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
831         uint8_t shift = rb->clear_bmap_shift;
832         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
833         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
834 
835         /*
836          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
837          * can make things easier sometimes since then start address
838          * of the small chunk will always be 64 pages aligned so the
839          * bitmap will always be aligned to unsigned long.  We should
840          * even be able to remove this restriction but I'm simply
841          * keeping it.
842          */
843         assert(shift >= 6);
844         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
845         memory_region_clear_dirty_bitmap(rb->mr, start, size);
846     }
847 
848     ret = test_and_clear_bit(page, rb->bmap);
849 
850     if (ret) {
851         rs->migration_dirty_pages--;
852     }
853 
854     return ret;
855 }
856 
857 /* Called with RCU critical section */
858 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
859 {
860     uint64_t new_dirty_pages =
861         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
862 
863     rs->migration_dirty_pages += new_dirty_pages;
864     rs->num_dirty_pages_period += new_dirty_pages;
865 }
866 
867 /**
868  * ram_pagesize_summary: calculate all the pagesizes of a VM
869  *
870  * Returns a summary bitmap of the page sizes of all RAMBlocks
871  *
872  * For VMs with just normal pages this is equivalent to the host page
873  * size. If it's got some huge pages then it's the OR of all the
874  * different page sizes.
875  */
876 uint64_t ram_pagesize_summary(void)
877 {
878     RAMBlock *block;
879     uint64_t summary = 0;
880 
881     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
882         summary |= block->page_size;
883     }
884 
885     return summary;
886 }
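
/*
 * Editor's example: a guest backed by 4 KiB normal pages plus 2 MiB hugepages
 * yields summary = 0x1000 | 0x200000 = 0x201000.
 */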
887 
888 uint64_t ram_get_total_transferred_pages(void)
889 {
890     return  ram_counters.normal + ram_counters.duplicate +
891                 compression_counters.pages + xbzrle_counters.pages;
892 }
893 
894 static void migration_update_rates(RAMState *rs, int64_t end_time)
895 {
896     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
897     double compressed_size;
898 
899     /* calculate period counters */
900     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
901                 / (end_time - rs->time_last_bitmap_sync);
902 
903     if (!page_count) {
904         return;
905     }
906 
907     if (migrate_use_xbzrle()) {
908         double encoded_size, unencoded_size;
909 
910         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
911             rs->xbzrle_cache_miss_prev) / page_count;
912         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
913         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
914                          TARGET_PAGE_SIZE;
915         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
916         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
917             xbzrle_counters.encoding_rate = 0;
918         } else {
919             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
920         }
921         rs->xbzrle_pages_prev = xbzrle_counters.pages;
922         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
923     }
924 
925     if (migrate_use_compression()) {
926         compression_counters.busy_rate = (double)(compression_counters.busy -
927             rs->compress_thread_busy_prev) / page_count;
928         rs->compress_thread_busy_prev = compression_counters.busy;
929 
930         compressed_size = compression_counters.compressed_size -
931                           rs->compressed_size_prev;
932         if (compressed_size) {
933             double uncompressed_size = (compression_counters.pages -
934                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
935 
936             /* Compression-Ratio = Uncompressed-size / Compressed-size */
937             compression_counters.compression_rate =
938                                         uncompressed_size / compressed_size;
939 
940             rs->compress_pages_prev = compression_counters.pages;
941             rs->compressed_size_prev = compression_counters.compressed_size;
942         }
943     }
944 }
945 
946 static void migration_trigger_throttle(RAMState *rs)
947 {
948     MigrationState *s = migrate_get_current();
949     uint64_t threshold = s->parameters.throttle_trigger_threshold;
950 
951     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
952     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
953     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
954 
955     /* During block migration the auto-converge logic incorrectly detects
956      * that ram migration makes no progress. Avoid this by disabling the
957      * throttling logic during the bulk phase of block migration. */
958     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
959         /* The following detection logic can be refined later. For now:
960            Check to see if the ratio between dirtied bytes and the approx.
961            amount of bytes that just got transferred since the last time
962            we were in this routine reaches the threshold. If that happens
963            twice, start or increase throttling. */
964 
965         if ((bytes_dirty_period > bytes_dirty_threshold) &&
966             (++rs->dirty_rate_high_cnt >= 2)) {
967             trace_migration_throttle();
968             rs->dirty_rate_high_cnt = 0;
969             mig_throttle_guest_down(bytes_dirty_period,
970                                     bytes_dirty_threshold);
971         }
972     }
973 }
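
/*
 * Editor's worked example with hypothetical numbers: with a
 * throttle_trigger_threshold of 50 and 100 MiB transferred in the last
 * period, bytes_dirty_threshold is 50 MiB; dirtying more than that during
 * the same period, twice in a row, starts or increases the CPU throttle.
 */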
974 
975 static void migration_bitmap_sync(RAMState *rs)
976 {
977     RAMBlock *block;
978     int64_t end_time;
979 
980     ram_counters.dirty_sync_count++;
981 
982     if (!rs->time_last_bitmap_sync) {
983         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
984     }
985 
986     trace_migration_bitmap_sync_start();
987     memory_global_dirty_log_sync();
988 
989     qemu_mutex_lock(&rs->bitmap_mutex);
990     WITH_RCU_READ_LOCK_GUARD() {
991         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
992             ramblock_sync_dirty_bitmap(rs, block);
993         }
994         ram_counters.remaining = ram_bytes_remaining();
995     }
996     qemu_mutex_unlock(&rs->bitmap_mutex);
997 
998     memory_global_after_dirty_log_sync();
999     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1000 
1001     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1002 
1003     /* more than 1 second = 1000 milliseconds */
1004     if (end_time > rs->time_last_bitmap_sync + 1000) {
1005         migration_trigger_throttle(rs);
1006 
1007         migration_update_rates(rs, end_time);
1008 
1009         rs->target_page_count_prev = rs->target_page_count;
1010 
1011         /* reset period counters */
1012         rs->time_last_bitmap_sync = end_time;
1013         rs->num_dirty_pages_period = 0;
1014         rs->bytes_xfer_prev = ram_counters.transferred;
1015     }
1016     if (migrate_use_events()) {
1017         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1018     }
1019 }
1020 
1021 static void migration_bitmap_sync_precopy(RAMState *rs)
1022 {
1023     Error *local_err = NULL;
1024 
1025     /*
1026      * The current notifier usage is just an optimization to migration, so we
1027      * don't stop the normal migration process in the error case.
1028      */
1029     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1030         error_report_err(local_err);
1031         local_err = NULL;
1032     }
1033 
1034     migration_bitmap_sync(rs);
1035 
1036     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1037         error_report_err(local_err);
1038     }
1039 }
1040 
1041 /**
1042  * save_zero_page_to_file: send the zero page to the file
1043  *
1044  * Returns the size of data written to the file, 0 means the page is not
1045  * a zero page
1046  *
1047  * @rs: current RAM state
1048  * @file: the file where the data is saved
1049  * @block: block that contains the page we want to send
1050  * @offset: offset inside the block for the page
1051  */
1052 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1053                                   RAMBlock *block, ram_addr_t offset)
1054 {
1055     uint8_t *p = block->host + offset;
1056     int len = 0;
1057 
1058     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1059         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1060         qemu_put_byte(file, 0);
1061         len += 1;
1062     }
1063     return len;
1064 }
1065 
1066 /**
1067  * save_zero_page: send the zero page to the stream
1068  *
1069  * Returns the number of pages written.
1070  *
1071  * @rs: current RAM state
1072  * @block: block that contains the page we want to send
1073  * @offset: offset inside the block for the page
1074  */
1075 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1076 {
1077     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1078 
1079     if (len) {
1080         ram_counters.duplicate++;
1081         ram_counters.transferred += len;
1082         return 1;
1083     }
1084     return -1;
1085 }
1086 
1087 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1088 {
1089     if (!migrate_release_ram() || !migration_in_postcopy()) {
1090         return;
1091     }
1092 
1093     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1094 }
1095 
1096 /*
1097  * @pages: the number of pages written by the control path,
1098  *        < 0 - error
1099  *        > 0 - number of pages written
1100  *
1101  * Return true if the page has been saved, otherwise false is returned.
1102  */
1103 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1104                               int *pages)
1105 {
1106     uint64_t bytes_xmit = 0;
1107     int ret;
1108 
1109     *pages = -1;
1110     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1111                                 &bytes_xmit);
1112     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1113         return false;
1114     }
1115 
1116     if (bytes_xmit) {
1117         ram_counters.transferred += bytes_xmit;
1118         *pages = 1;
1119     }
1120 
1121     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1122         return true;
1123     }
1124 
1125     if (bytes_xmit > 0) {
1126         ram_counters.normal++;
1127     } else if (bytes_xmit == 0) {
1128         ram_counters.duplicate++;
1129     }
1130 
1131     return true;
1132 }
1133 
1134 /*
1135  * directly send the page to the stream
1136  *
1137  * Returns the number of pages written.
1138  *
1139  * @rs: current RAM state
1140  * @block: block that contains the page we want to send
1141  * @offset: offset inside the block for the page
1142  * @buf: the page to be sent
1143  * @async: send the page asynchronously
1144  */
1145 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1146                             uint8_t *buf, bool async)
1147 {
1148     ram_counters.transferred += save_page_header(rs, rs->f, block,
1149                                                  offset | RAM_SAVE_FLAG_PAGE);
1150     if (async) {
1151         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1152                               migrate_release_ram() &
1153                               migration_in_postcopy());
1154     } else {
1155         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1156     }
1157     ram_counters.transferred += TARGET_PAGE_SIZE;
1158     ram_counters.normal++;
1159     return 1;
1160 }
1161 
1162 /**
1163  * ram_save_page: send the given page to the stream
1164  *
1165  * Returns the number of pages written.
1166  *          < 0 - error
1167  *          >=0 - Number of pages written - this might legally be 0
1168  *                if xbzrle noticed the page was the same.
1169  *
1170  * @rs: current RAM state
1171  * @block: block that contains the page we want to send
1172  * @offset: offset inside the block for the page
1173  * @last_stage: if we are at the completion stage
1174  */
1175 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1176 {
1177     int pages = -1;
1178     uint8_t *p;
1179     bool send_async = true;
1180     RAMBlock *block = pss->block;
1181     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1182     ram_addr_t current_addr = block->offset + offset;
1183 
1184     p = block->host + offset;
1185     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1186 
1187     XBZRLE_cache_lock();
1188     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1189         migrate_use_xbzrle()) {
1190         pages = save_xbzrle_page(rs, &p, current_addr, block,
1191                                  offset, last_stage);
1192         if (!last_stage) {
1193             /* Can't send this cached data async, since the cache page
1194              * might get updated before it gets to the wire
1195              */
1196             send_async = false;
1197         }
1198     }
1199 
1200     /* XBZRLE overflow or normal page */
1201     if (pages == -1) {
1202         pages = save_normal_page(rs, block, offset, p, send_async);
1203     }
1204 
1205     XBZRLE_cache_unlock();
1206 
1207     return pages;
1208 }
1209 
1210 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1211                                  ram_addr_t offset)
1212 {
1213     if (multifd_queue_page(rs->f, block, offset) < 0) {
1214         return -1;
1215     }
1216     ram_counters.normal++;
1217 
1218     return 1;
1219 }
1220 
1221 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1222                                  ram_addr_t offset, uint8_t *source_buf)
1223 {
1224     RAMState *rs = ram_state;
1225     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1226     bool zero_page = false;
1227     int ret;
1228 
1229     if (save_zero_page_to_file(rs, f, block, offset)) {
1230         zero_page = true;
1231         goto exit;
1232     }
1233 
1234     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1235 
1236     /*
1237      * copy it to an internal buffer to avoid it being modified by the VM
1238      * so that we can catch errors during compression and
1239      * decompression
1240      */
1241     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1242     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1243     if (ret < 0) {
1244         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1245         error_report("compressed data failed!");
1246         return false;
1247     }
1248 
1249 exit:
1250     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1251     return zero_page;
1252 }
1253 
1254 static void
1255 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1256 {
1257     ram_counters.transferred += bytes_xmit;
1258 
1259     if (param->zero_page) {
1260         ram_counters.duplicate++;
1261         return;
1262     }
1263 
1264     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1265     compression_counters.compressed_size += bytes_xmit - 8;
1266     compression_counters.pages++;
1267 }
1268 
1269 static bool save_page_use_compression(RAMState *rs);
1270 
1271 static void flush_compressed_data(RAMState *rs)
1272 {
1273     int idx, len, thread_count;
1274 
1275     if (!save_page_use_compression(rs)) {
1276         return;
1277     }
1278     thread_count = migrate_compress_threads();
1279 
1280     qemu_mutex_lock(&comp_done_lock);
1281     for (idx = 0; idx < thread_count; idx++) {
1282         while (!comp_param[idx].done) {
1283             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1284         }
1285     }
1286     qemu_mutex_unlock(&comp_done_lock);
1287 
1288     for (idx = 0; idx < thread_count; idx++) {
1289         qemu_mutex_lock(&comp_param[idx].mutex);
1290         if (!comp_param[idx].quit) {
1291             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1292             /*
1293              * it's safe to fetch zero_page without holding comp_done_lock
1294              * as there is no further request submitted to the thread,
1295              * i.e., the thread should be waiting for a request at this point.
1296              */
1297             update_compress_thread_counts(&comp_param[idx], len);
1298         }
1299         qemu_mutex_unlock(&comp_param[idx].mutex);
1300     }
1301 }
1302 
1303 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1304                                        ram_addr_t offset)
1305 {
1306     param->block = block;
1307     param->offset = offset;
1308 }
1309 
1310 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1311                                            ram_addr_t offset)
1312 {
1313     int idx, thread_count, bytes_xmit = -1, pages = -1;
1314     bool wait = migrate_compress_wait_thread();
1315 
1316     thread_count = migrate_compress_threads();
1317     qemu_mutex_lock(&comp_done_lock);
1318 retry:
1319     for (idx = 0; idx < thread_count; idx++) {
1320         if (comp_param[idx].done) {
1321             comp_param[idx].done = false;
1322             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1323             qemu_mutex_lock(&comp_param[idx].mutex);
1324             set_compress_params(&comp_param[idx], block, offset);
1325             qemu_cond_signal(&comp_param[idx].cond);
1326             qemu_mutex_unlock(&comp_param[idx].mutex);
1327             pages = 1;
1328             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1329             break;
1330         }
1331     }
1332 
1333     /*
1334      * wait for the free thread if the user specifies 'compress-wait-thread',
1335      * otherwise we will post the page out in the main thread as a normal page.
1336      */
1337     if (pages < 0 && wait) {
1338         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1339         goto retry;
1340     }
1341     qemu_mutex_unlock(&comp_done_lock);
1342 
1343     return pages;
1344 }
1345 
1346 /**
1347  * find_dirty_block: find the next dirty page and update any state
1348  * associated with the search process.
1349  *
1350  * Returns true if a page is found
1351  *
1352  * @rs: current RAM state
1353  * @pss: data about the state of the current dirty page scan
1354  * @again: set to false if the search has scanned the whole of RAM
1355  */
1356 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1357 {
1358     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1359     if (pss->complete_round && pss->block == rs->last_seen_block &&
1360         pss->page >= rs->last_page) {
1361         /*
1362          * We've been once around the RAM and haven't found anything.
1363          * Give up.
1364          */
1365         *again = false;
1366         return false;
1367     }
1368     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1369         >= pss->block->used_length) {
1370         /* Didn't find anything in this RAM Block */
1371         pss->page = 0;
1372         pss->block = QLIST_NEXT_RCU(pss->block, next);
1373         if (!pss->block) {
1374             /*
1375              * If memory migration starts over, we will meet a dirtied page
1376              * which may still exists in compression threads's ring, so we
1377              * should flush the compressed data to make sure the new page
1378              * is not overwritten by the old one in the destination.
1379              *
1380              * Also If xbzrle is on, stop using the data compression at this
1381              * point. In theory, xbzrle can do better than compression.
1382              */
1383             flush_compressed_data(rs);
1384 
1385             /* Hit the end of the list */
1386             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1387             /* Flag that we've looped */
1388             pss->complete_round = true;
1389             rs->ram_bulk_stage = false;
1390         }
1391         /* Didn't find anything this time, but try again on the new block */
1392         *again = true;
1393         return false;
1394     } else {
1395         /* Can go around again, but... */
1396         *again = true;
1397         /* We've found something so probably don't need to */
1398         return true;
1399     }
1400 }
1401 
1402 /**
1403  * unqueue_page: gets a page off the queue
1404  *
1405  * Helper for 'get_queued_page' - gets a page off the queue
1406  *
1407  * Returns the block of the page (or NULL if none available)
1408  *
1409  * @rs: current RAM state
1410  * @offset: used to return the offset within the RAMBlock
1411  */
1412 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1413 {
1414     RAMBlock *block = NULL;
1415 
1416     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1417         return NULL;
1418     }
1419 
1420     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1421     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1422         struct RAMSrcPageRequest *entry =
1423                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1424         block = entry->rb;
1425         *offset = entry->offset;
1426 
1427         if (entry->len > TARGET_PAGE_SIZE) {
1428             entry->len -= TARGET_PAGE_SIZE;
1429             entry->offset += TARGET_PAGE_SIZE;
1430         } else {
1431             memory_region_unref(block->mr);
1432             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1433             g_free(entry);
1434             migration_consume_urgent_request();
1435         }
1436     }
1437 
1438     return block;
1439 }
1440 
1441 #if defined(__linux__)
1442 /**
1443  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1444  *   is found, return RAM block pointer and page offset
1445  *
1446  * Returns pointer to the RAMBlock containing faulting page,
1447  *   NULL if no write faults are pending
1448  *
1449  * @rs: current RAM state
1450  * @offset: page offset from the beginning of the block
1451  */
1452 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1453 {
1454     struct uffd_msg uffd_msg;
1455     void *page_address;
1456     RAMBlock *block;
1457     int res;
1458 
1459     if (!migrate_background_snapshot()) {
1460         return NULL;
1461     }
1462 
1463     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1464     if (res <= 0) {
1465         return NULL;
1466     }
1467 
1468     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1469     block = qemu_ram_block_from_host(page_address, false, offset);
1470     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1471     return block;
1472 }
1473 
1474 /**
1475  * ram_save_release_protection: release UFFD write protection after
1476  *   a range of pages has been saved
1477  *
1478  * @rs: current RAM state
1479  * @pss: page-search-status structure
1480  * @start_page: index of the first page in the range relative to pss->block
1481  *
1482  * Returns 0 on success, negative value in case of an error
1483 */
1484 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1485         unsigned long start_page)
1486 {
1487     int res = 0;
1488 
1489     /* Check if page is from UFFD-managed region. */
1490     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1491         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1492         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1493 
1494         /* Flush async buffers before un-protect. */
1495         qemu_fflush(rs->f);
1496         /* Un-protect memory range. */
1497         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1498                 false, false);
1499     }
1500 
1501     return res;
1502 }
1503 
1504 /* ram_write_tracking_available: check if kernel supports required UFFD features
1505  *
1506  * Returns true if supported, false otherwise
1507  */
1508 bool ram_write_tracking_available(void)
1509 {
1510     uint64_t uffd_features;
1511     int res;
1512 
1513     res = uffd_query_features(&uffd_features);
1514     return (res == 0 &&
1515             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1516 }
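
/*
 * Editor's illustrative usage sketch: a caller enabling background snapshot
 * (write-tracking) support might gate it as follows, assuming an Error **errp
 * in scope.
 *
 *     if (!ram_write_tracking_available()) {
 *         error_setg(errp, "Kernel lacks UFFD write-protection support");
 *         return false;
 *     }
 */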
1517 
1518 /* ram_write_tracking_compatible: check if guest configuration is
1519  *   compatible with 'write-tracking'
1520  *
1521  * Returns true if compatible, false otherwise
1522  */
1523 bool ram_write_tracking_compatible(void)
1524 {
1525     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1526     int uffd_fd;
1527     RAMBlock *block;
1528     bool ret = false;
1529 
1530     /* Open UFFD file descriptor */
1531     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1532     if (uffd_fd < 0) {
1533         return false;
1534     }
1535 
1536     RCU_READ_LOCK_GUARD();
1537 
1538     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1539         uint64_t uffd_ioctls;
1540 
1541         /* Nothing to do with read-only and MMIO-writable regions */
1542         if (block->mr->readonly || block->mr->rom_device) {
1543             continue;
1544         }
1545         /* Try to register block memory via UFFD-IO to track writes */
1546         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1547                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1548             goto out;
1549         }
1550         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1551             goto out;
1552         }
1553     }
1554     ret = true;
1555 
1556 out:
1557     uffd_close_fd(uffd_fd);
1558     return ret;
1559 }
1560 
1561 /*
1562  * ram_block_populate_pages: populate memory in the RAM block by reading
1563  *   an integer from the beginning of each page.
1564  *
1565  * Since it's solely used for the userfault_fd WP feature, here we just
1566  *   hardcode the page size to qemu_real_host_page_size.
1567  *
1568  * @block: RAM block to populate
1569  */
1570 static void ram_block_populate_pages(RAMBlock *block)
1571 {
1572     char *ptr = (char *) block->host;
1573 
1574     for (ram_addr_t offset = 0; offset < block->used_length;
1575             offset += qemu_real_host_page_size) {
1576         char tmp = *(ptr + offset);
1577 
1578         /* Don't optimize the read out */
1579         asm volatile("" : "+r" (tmp));
1580     }
1581 }
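
/*
 * Editor's note: the empty asm with the "+r" constraint above marks tmp as
 * both read and written, so the compiler cannot drop the load from
 * ptr + offset; that load is what forces the kernel to populate a PTE for
 * every host page before write protection is applied.
 */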
1582 
1583 /*
1584  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1585  */
1586 void ram_write_tracking_prepare(void)
1587 {
1588     RAMBlock *block;
1589 
1590     RCU_READ_LOCK_GUARD();
1591 
1592     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1593         /* Nothing to do with read-only and MMIO-writable regions */
1594         if (block->mr->readonly || block->mr->rom_device) {
1595             continue;
1596         }
1597 
1598         /*
1599          * Populate pages of the RAM block before enabling userfault_fd
1600          * write protection.
1601          *
1602          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1603          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1604          * pages with pte_none() entries in page table.
1605          */
1606         ram_block_populate_pages(block);
1607     }
1608 }
1609 
1610 /*
1611  * ram_write_tracking_start: start UFFD-WP memory tracking
1612  *
1613  * Returns 0 for success or negative value in case of error
1614  */
1615 int ram_write_tracking_start(void)
1616 {
1617     int uffd_fd;
1618     RAMState *rs = ram_state;
1619     RAMBlock *block;
1620 
1621     /* Open UFFD file descriptor */
1622     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1623     if (uffd_fd < 0) {
1624         return uffd_fd;
1625     }
1626     rs->uffdio_fd = uffd_fd;
1627 
1628     RCU_READ_LOCK_GUARD();
1629 
1630     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1631         /* Nothing to do for read-only and MMIO-writable regions */
1632         if (block->mr->readonly || block->mr->rom_device) {
1633             continue;
1634         }
1635 
1636         /* Register block memory with UFFD to track writes */
1637         if (uffd_register_memory(rs->uffdio_fd, block->host,
1638                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1639             goto fail;
1640         }
1641         /* Apply UFFD write protection to the block memory range */
1642         if (uffd_change_protection(rs->uffdio_fd, block->host,
1643                 block->max_length, true, false)) {
1644             goto fail;
1645         }
1646         block->flags |= RAM_UF_WRITEPROTECT;
1647         memory_region_ref(block->mr);
1648 
1649         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1650                 block->host, block->max_length);
1651     }
1652 
1653     return 0;
1654 
1655 fail:
1656     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1657 
1658     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1659         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1660             continue;
1661         }
1662         /*
1663          * If some memory block failed to be write-protected, remove
1664          * protection and unregister all RAM blocks that succeeded
1665          */
1666         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1667                 false, false);
1668         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1669         /* Cleanup flags and remove reference */
1670         block->flags &= ~RAM_UF_WRITEPROTECT;
1671         memory_region_unref(block->mr);
1672     }
1673 
1674     uffd_close_fd(uffd_fd);
1675     rs->uffdio_fd = -1;
1676     return -1;
1677 }
1678 
1679 /**
1680  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1681  */
1682 void ram_write_tracking_stop(void)
1683 {
1684     RAMState *rs = ram_state;
1685     RAMBlock *block;
1686 
1687     RCU_READ_LOCK_GUARD();
1688 
1689     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1690         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1691             continue;
1692         }
1693         /* Remove protection and unregister all affected RAM blocks */
1694         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1695                 false, false);
1696         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1697 
1698         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1699                 block->host, block->max_length);
1700 
1701         /* Cleanup flags and remove reference */
1702         block->flags &= ~RAM_UF_WRITEPROTECT;
1703         memory_region_unref(block->mr);
1704     }
1705 
1706     /* Finally close UFFD file descriptor */
1707     uffd_close_fd(rs->uffdio_fd);
1708     rs->uffdio_fd = -1;
1709 }
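
/*
 * Illustrative sketch: a hypothetical caller combining the write-tracking
 * helpers above in the order the background snapshot path expects.  The
 * function name is made up; the real callers live in the generic
 * migration code.
 *
 *     static int example_start_background_snapshot_ram(void)
 *     {
 *         if (!ram_write_tracking_available() ||
 *             !ram_write_tracking_compatible()) {
 *             return -ENOTSUP;
 *         }
 *         ram_write_tracking_prepare();      // touch pages so WP can apply
 *         if (ram_write_tracking_start()) {  // register + write-protect RAM
 *             return -1;
 *         }
 *         // ... save RAM while write faults are resolved via
 *         //     poll_fault_page()/ram_save_release_protection() ...
 *         ram_write_tracking_stop();         // unprotect and close the uffd
 *         return 0;
 *     }
 */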
1710 
1711 #else
1712 /* No target OS support, stubs just fail or ignore */
1713 
1714 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1715 {
1716     (void) rs;
1717     (void) offset;
1718 
1719     return NULL;
1720 }
1721 
1722 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1723         unsigned long start_page)
1724 {
1725     (void) rs;
1726     (void) pss;
1727     (void) start_page;
1728 
1729     return 0;
1730 }
1731 
1732 bool ram_write_tracking_available(void)
1733 {
1734     return false;
1735 }
1736 
1737 bool ram_write_tracking_compatible(void)
1738 {
1739     assert(0);
1740     return false;
1741 }
1742 
1743 int ram_write_tracking_start(void)
1744 {
1745     assert(0);
1746     return -1;
1747 }
1748 
1749 void ram_write_tracking_stop(void)
1750 {
1751     assert(0);
1752 }
1753 #endif /* defined(__linux__) */
1754 
1755 /**
1756  * get_queued_page: unqueue a page from the postcopy requests
1757  *
1758  * Skips pages that are already sent (!dirty)
1759  *
1760  * Returns true if a queued page is found
1761  *
1762  * @rs: current RAM state
1763  * @pss: data about the state of the current dirty page scan
1764  */
1765 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1766 {
1767     RAMBlock  *block;
1768     ram_addr_t offset;
1769     bool dirty;
1770 
1771     do {
1772         block = unqueue_page(rs, &offset);
1773         /*
1774          * We're sending this page, and since it's postcopy nothing else
1775          * will dirty it, and we must make sure it doesn't get sent again
1776          * even if this queue request was received after the background
1777          * search already sent it.
1778          */
1779         if (block) {
1780             unsigned long page;
1781 
1782             page = offset >> TARGET_PAGE_BITS;
1783             dirty = test_bit(page, block->bmap);
1784             if (!dirty) {
1785                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1786                                                 page);
1787             } else {
1788                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1789             }
1790         }
1791 
1792     } while (block && !dirty);
1793 
1794     if (!block) {
1795         /*
1796          * Poll write faults too if background snapshot is enabled; that's
1797          * when vcpus may get blocked by write-protected pages.
1798          */
1799         block = poll_fault_page(rs, &offset);
1800     }
1801 
1802     if (block) {
1803         /*
1804          * As soon as we start servicing pages out of order, we have
1805          * to kill the bulk stage, since the bulk stage assumes
1806          * (in migration_bitmap_find_and_reset_dirty) that every page is
1807          * dirty, and that's no longer true.
1808          */
1809         rs->ram_bulk_stage = false;
1810 
1811         /*
1812          * We want the background search to continue from the queued page
1813          * since the guest is likely to want other pages near to the page
1814          * it just requested.
1815          */
1816         pss->block = block;
1817         pss->page = offset >> TARGET_PAGE_BITS;
1818 
1819         /*
1820          * This unqueued page would break the "one round" check, even if
1821          * it is really rare.
1822          */
1823         pss->complete_round = false;
1824     }
1825 
1826     return !!block;
1827 }
1828 
1829 /**
1830  * migration_page_queue_free: drop any remaining pages in the ram
1831  * request queue
1832  *
1833  * It should be empty at the end anyway, but in error cases there may
1834  * be some left.  If any pages are left, we drop them.
1835  *
1836  */
1837 static void migration_page_queue_free(RAMState *rs)
1838 {
1839     struct RAMSrcPageRequest *mspr, *next_mspr;
1840     /* This queue generally should be empty - but in the case of a failed
1841      * migration it might have some leftovers.
1842      */
1843     RCU_READ_LOCK_GUARD();
1844     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845         memory_region_unref(mspr->rb->mr);
1846         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847         g_free(mspr);
1848     }
1849 }
1850 
1851 /**
1852  * ram_save_queue_pages: queue the page for transmission
1853  *
1854  * A request from postcopy destination for example.
1855  *
1856  * Returns zero on success or negative on error
1857  *
1858  * @rbname: Name of the RAMBlock of the request. NULL means the
1859  *          same as the last one.
1860  * @start: starting address from the start of the RAMBlock
1861  * @len: length (in bytes) to send
1862  */
1863 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1864 {
1865     RAMBlock *ramblock;
1866     RAMState *rs = ram_state;
1867 
1868     ram_counters.postcopy_requests++;
1869     RCU_READ_LOCK_GUARD();
1870 
1871     if (!rbname) {
1872         /* Reuse last RAMBlock */
1873         ramblock = rs->last_req_rb;
1874 
1875         if (!ramblock) {
1876             /*
1877              * Shouldn't happen, we can't reuse the last RAMBlock if
1878              * it's the 1st request.
1879              */
1880             error_report("ram_save_queue_pages no previous block");
1881             return -1;
1882         }
1883     } else {
1884         ramblock = qemu_ram_block_by_name(rbname);
1885 
1886         if (!ramblock) {
1887             /* We shouldn't be asked for a non-existent RAMBlock */
1888             error_report("ram_save_queue_pages no block '%s'", rbname);
1889             return -1;
1890         }
1891         rs->last_req_rb = ramblock;
1892     }
1893     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894     if (start + len > ramblock->used_length) {
1895         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897                      __func__, start, len, ramblock->used_length);
1898         return -1;
1899     }
1900 
1901     struct RAMSrcPageRequest *new_entry =
1902         g_malloc0(sizeof(struct RAMSrcPageRequest));
1903     new_entry->rb = ramblock;
1904     new_entry->offset = start;
1905     new_entry->len = len;
1906 
1907     memory_region_ref(ramblock->mr);
1908     qemu_mutex_lock(&rs->src_page_req_mutex);
1909     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910     migration_make_urgent_request();
1911     qemu_mutex_unlock(&rs->src_page_req_mutex);
1912 
1913     return 0;
1914 }
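
/*
 * Usage sketch (hypothetical values): a postcopy destination request for
 * one target page of a block named "pc.ram" would be queued like this:
 *
 *     if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
 *         // rejected: unknown block or range beyond used_length
 *     }
 *
 * Passing rbname == NULL reuses the RAMBlock of the previous request.
 */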
1915 
1916 static bool save_page_use_compression(RAMState *rs)
1917 {
1918     if (!migrate_use_compression()) {
1919         return false;
1920     }
1921 
1922     /*
1923      * If xbzrle is on, stop using the data compression after the first
1924      * round of migration even if compression is enabled. In theory,
1925      * xbzrle can do better than compression.
1926      */
1927     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1928         return true;
1929     }
1930 
1931     return false;
1932 }
1933 
1934 /*
1935  * Try to compress the page before posting it out; return true if the page
1936  * has been properly handled by compression, otherwise it needs to be
1937  * handled by other paths
1938  */
1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1940 {
1941     if (!save_page_use_compression(rs)) {
1942         return false;
1943     }
1944 
1945     /*
1946      * When starting the process of a new block, the first page of
1947      * the block should be sent out before other pages in the same
1948      * block, and all the pages in the last block should have been
1949      * sent out.  Keeping this order is important, because the 'cont'
1950      * flag is used to avoid resending the block name.
1951      *
1952      * We post the first page as a normal page since compression will
1953      * take a lot of CPU.
1954      */
1955     if (block != rs->last_sent_block) {
1956         flush_compressed_data(rs);
1957         return false;
1958     }
1959 
1960     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961         return true;
1962     }
1963 
1964     compression_counters.busy++;
1965     return false;
1966 }
1967 
1968 /**
1969  * ram_save_target_page: save one target page
1970  *
1971  * Returns the number of pages written
1972  *
1973  * @rs: current RAM state
1974  * @pss: data about the page we want to send
1975  * @last_stage: if we are at the completion stage
1976  */
1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978                                 bool last_stage)
1979 {
1980     RAMBlock *block = pss->block;
1981     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982     int res;
1983 
1984     if (control_save_page(rs, block, offset, &res)) {
1985         return res;
1986     }
1987 
1988     if (save_compress_page(rs, block, offset)) {
1989         return 1;
1990     }
1991 
1992     res = save_zero_page(rs, block, offset);
1993     if (res > 0) {
1994         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995          * page would be stale
1996          */
1997         if (!save_page_use_compression(rs)) {
1998             XBZRLE_cache_lock();
1999             xbzrle_cache_zero_page(rs, block->offset + offset);
2000             XBZRLE_cache_unlock();
2001         }
2002         ram_release_pages(block->idstr, offset, res);
2003         return res;
2004     }
2005 
2006     /*
2007      * Do not use multifd for:
2008      * 1. Compression as the first page in the new block should be posted out
2009      *    before sending the compressed page
2010      * 2. In postcopy as one whole host page should be placed
2011      */
2012     if (!save_page_use_compression(rs) && migrate_use_multifd()
2013         && !migration_in_postcopy()) {
2014         return ram_save_multifd_page(rs, block, offset);
2015     }
2016 
2017     return ram_save_page(rs, pss, last_stage);
2018 }
2019 
2020 /**
2021  * ram_save_host_page: save a whole host page
2022  *
2023  * Starting at *offset send pages up to the end of the current host
2024  * page. It's valid for the initial offset to point into the middle of
2025  * a host page, in which case the remainder of the host page is sent.
2026  * Only dirty target pages are sent. Note that the host page size may
2027  * be a huge page for this block.
2028  * The saving stops at the boundary of the used_length of the block
2029  * if the RAMBlock isn't a multiple of the host page size.
2030  *
2031  * Returns the number of pages written or negative on error
2032  *
2033  * @rs: current RAM state
2035  * @pss: data about the page we want to send
2036  * @last_stage: if we are at the completion stage
2037  */
2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039                               bool last_stage)
2040 {
2041     int tmppages, pages = 0;
2042     size_t pagesize_bits =
2043         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044     unsigned long start_page = pss->page;
2045     int res;
2046 
2047     if (ramblock_is_ignored(pss->block)) {
2048         error_report("block %s should not be migrated !", pss->block->idstr);
2049         return 0;
2050     }
2051 
2052     do {
2053         /* Check if the page is dirty and, if it is, send it */
2054         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2055             pss->page++;
2056             continue;
2057         }
2058 
2059         tmppages = ram_save_target_page(rs, pss, last_stage);
2060         if (tmppages < 0) {
2061             return tmppages;
2062         }
2063 
2064         pages += tmppages;
2065         pss->page++;
2066         /* Allow rate limiting to happen in the middle of huge pages */
2067         migration_rate_limit();
2068     } while ((pss->page & (pagesize_bits - 1)) &&
2069              offset_in_ramblock(pss->block,
2070                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2071     /* The offset we leave with is the last one we looked at */
2072     pss->page--;
2073 
2074     res = ram_save_release_protection(rs, pss, start_page);
2075     return (res < 0 ? res : pages);
2076 }
2077 
2078 /**
2079  * ram_find_and_save_block: finds a dirty page and sends it to f
2080  *
2081  * Called within an RCU critical section.
2082  *
2083  * Returns the number of pages written where zero means no dirty pages,
2084  * or negative on error
2085  *
2086  * @rs: current RAM state
2087  * @last_stage: if we are at the completion stage
2088  *
2089  * On systems where host-page-size > target-page-size it will send all the
2090  * pages in a host page that are dirty.
2091  */
2092 
2093 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2094 {
2095     PageSearchStatus pss;
2096     int pages = 0;
2097     bool again, found;
2098 
2099     /* No dirty page as there is zero RAM */
2100     if (!ram_bytes_total()) {
2101         return pages;
2102     }
2103 
2104     pss.block = rs->last_seen_block;
2105     pss.page = rs->last_page;
2106     pss.complete_round = false;
2107 
2108     if (!pss.block) {
2109         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2110     }
2111 
2112     do {
2113         again = true;
2114         found = get_queued_page(rs, &pss);
2115 
2116         if (!found) {
2117             /* priority queue empty, so just search for something dirty */
2118             found = find_dirty_block(rs, &pss, &again);
2119         }
2120 
2121         if (found) {
2122             pages = ram_save_host_page(rs, &pss, last_stage);
2123         }
2124     } while (!pages && again);
2125 
2126     rs->last_seen_block = pss.block;
2127     rs->last_page = pss.page;
2128 
2129     return pages;
2130 }
2131 
2132 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2133 {
2134     uint64_t pages = size / TARGET_PAGE_SIZE;
2135 
2136     if (zero) {
2137         ram_counters.duplicate += pages;
2138     } else {
2139         ram_counters.normal += pages;
2140         ram_counters.transferred += size;
2141         qemu_update_position(f, size);
2142     }
2143 }
2144 
2145 static uint64_t ram_bytes_total_common(bool count_ignored)
2146 {
2147     RAMBlock *block;
2148     uint64_t total = 0;
2149 
2150     RCU_READ_LOCK_GUARD();
2151 
2152     if (count_ignored) {
2153         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2154             total += block->used_length;
2155         }
2156     } else {
2157         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2158             total += block->used_length;
2159         }
2160     }
2161     return total;
2162 }
2163 
2164 uint64_t ram_bytes_total(void)
2165 {
2166     return ram_bytes_total_common(false);
2167 }
2168 
2169 static void xbzrle_load_setup(void)
2170 {
2171     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2172 }
2173 
2174 static void xbzrle_load_cleanup(void)
2175 {
2176     g_free(XBZRLE.decoded_buf);
2177     XBZRLE.decoded_buf = NULL;
2178 }
2179 
2180 static void ram_state_cleanup(RAMState **rsp)
2181 {
2182     if (*rsp) {
2183         migration_page_queue_free(*rsp);
2184         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2185         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2186         g_free(*rsp);
2187         *rsp = NULL;
2188     }
2189 }
2190 
2191 static void xbzrle_cleanup(void)
2192 {
2193     XBZRLE_cache_lock();
2194     if (XBZRLE.cache) {
2195         cache_fini(XBZRLE.cache);
2196         g_free(XBZRLE.encoded_buf);
2197         g_free(XBZRLE.current_buf);
2198         g_free(XBZRLE.zero_target_page);
2199         XBZRLE.cache = NULL;
2200         XBZRLE.encoded_buf = NULL;
2201         XBZRLE.current_buf = NULL;
2202         XBZRLE.zero_target_page = NULL;
2203     }
2204     XBZRLE_cache_unlock();
2205 }
2206 
2207 static void ram_save_cleanup(void *opaque)
2208 {
2209     RAMState **rsp = opaque;
2210     RAMBlock *block;
2211 
2212     /* We don't use dirty log with background snapshots */
2213     if (!migrate_background_snapshot()) {
2214         /* The caller must hold the iothread lock or be in a bh, so there
2215          * is no writing race against the migration bitmap
2216          */
2217         memory_global_dirty_log_stop();
2218     }
2219 
2220     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2221         g_free(block->clear_bmap);
2222         block->clear_bmap = NULL;
2223         g_free(block->bmap);
2224         block->bmap = NULL;
2225     }
2226 
2227     xbzrle_cleanup();
2228     compress_threads_save_cleanup();
2229     ram_state_cleanup(rsp);
2230 }
2231 
2232 static void ram_state_reset(RAMState *rs)
2233 {
2234     rs->last_seen_block = NULL;
2235     rs->last_sent_block = NULL;
2236     rs->last_page = 0;
2237     rs->last_version = ram_list.version;
2238     rs->ram_bulk_stage = true;
2239     rs->fpo_enabled = false;
2240 }
2241 
2242 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2243 
2244 /*
2245  * 'expected' is the value you expect the bitmap mostly to be full
2246  * of; it won't bother printing lines that are all this value.
2247  * If 'todump' is null the migration bitmap is dumped.
2248  * The caller passes the bitmap to dump in 'todump'.
2249 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2250                            unsigned long pages)
2251 {
2252     int64_t cur;
2253     int64_t linelen = 128;
2254     char linebuf[129];
2255 
2256     for (cur = 0; cur < pages; cur += linelen) {
2257         int64_t curb;
2258         bool found = false;
2259         /*
2260          * Last line; catch the case where the line length
2261          * is longer than remaining ram
2262          */
2263         if (cur + linelen > pages) {
2264             linelen = pages - cur;
2265         }
2266         for (curb = 0; curb < linelen; curb++) {
2267             bool thisbit = test_bit(cur + curb, todump);
2268             linebuf[curb] = thisbit ? '1' : '.';
2269             found = found || (thisbit != expected);
2270         }
2271         if (found) {
2272             linebuf[curb] = '\0';
2273             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2274         }
2275     }
2276 }
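
/*
 * Example of the dump format (made-up data): each emitted line starts with
 * the bit offset of its first bit, followed by up to 128 bits rendered as
 * '1' (set) or '.' (clear); lines consisting only of 'expected' bits are
 * suppressed.
 *
 *     0x00000080 : 1111....111111111111........11111111111111111111
 */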
2277 
2278 /* **** functions for postcopy ***** */
2279 
2280 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2281 {
2282     struct RAMBlock *block;
2283 
2284     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2285         unsigned long *bitmap = block->bmap;
2286         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2287         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2288 
2289         while (run_start < range) {
2290             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2291             ram_discard_range(block->idstr,
2292                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2293                               ((ram_addr_t)(run_end - run_start))
2294                                 << TARGET_PAGE_BITS);
2295             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2296         }
2297     }
2298 }
2299 
2300 /**
2301  * postcopy_send_discard_bm_ram: discard a RAMBlock
2302  *
2303  * Returns zero on success
2304  *
2305  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2306  *
2307  * @ms: current migration state
2308  * @block: RAMBlock to discard
2309  */
2310 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2311 {
2312     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2313     unsigned long current;
2314     unsigned long *bitmap = block->bmap;
2315 
2316     for (current = 0; current < end; ) {
2317         unsigned long one = find_next_bit(bitmap, end, current);
2318         unsigned long zero, discard_length;
2319 
2320         if (one >= end) {
2321             break;
2322         }
2323 
2324         zero = find_next_zero_bit(bitmap, end, one + 1);
2325 
2326         if (zero >= end) {
2327             discard_length = end - one;
2328         } else {
2329             discard_length = zero - one;
2330         }
2331         postcopy_discard_send_range(ms, one, discard_length);
2332         current = one + discard_length;
2333     }
2334 
2335     return 0;
2336 }
2337 
2338 /**
2339  * postcopy_each_ram_send_discard: discard all RAMBlocks
2340  *
2341  * Returns 0 for success or negative for error
2342  *
2343  * Utility for the outgoing postcopy code.
2344  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2345  *   passing it bitmap indexes and name.
2346  * (qemu_ram_foreach_block ends up passing unscaled lengths
2347  *  which would mean postcopy code would have to deal with target page)
2348  *
2349  * @ms: current migration state
2350  */
2351 static int postcopy_each_ram_send_discard(MigrationState *ms)
2352 {
2353     struct RAMBlock *block;
2354     int ret;
2355 
2356     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2357         postcopy_discard_send_init(ms, block->idstr);
2358 
2359         /*
2360          * Postcopy sends chunks of bitmap over the wire, but it
2361          * just needs indexes at this point, which avoids it having
2362          * target-page-specific code.
2363          */
2364         ret = postcopy_send_discard_bm_ram(ms, block);
2365         postcopy_discard_send_finish(ms);
2366         if (ret) {
2367             return ret;
2368         }
2369     }
2370 
2371     return 0;
2372 }
2373 
2374 /**
2375  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2376  *
2377  * Helper for postcopy_chunk_hostpages; it's called twice to
2378  * canonicalize the two bitmaps, which are similar, but one is
2379  * inverted.
2380  *
2381  * Postcopy requires that all target pages in a hostpage are dirty or
2382  * clean, not a mix.  This function canonicalizes the bitmaps.
2383  *
2384  * @ms: current migration state
2385  * @block: block that contains the page we want to canonicalize
2386  */
2387 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2388 {
2389     RAMState *rs = ram_state;
2390     unsigned long *bitmap = block->bmap;
2391     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2392     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2393     unsigned long run_start;
2394 
2395     if (block->page_size == TARGET_PAGE_SIZE) {
2396         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2397         return;
2398     }
2399 
2400     /* Find a dirty page */
2401     run_start = find_next_bit(bitmap, pages, 0);
2402 
2403     while (run_start < pages) {
2404 
2405         /*
2406          * If the start of this run of pages is in the middle of a host
2407          * page, then we need to fixup this host page.
2408          */
2409         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2410             /* Find the end of this run */
2411             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2412             /*
2413              * If the end isn't at the start of a host page, then the
2414              * run doesn't finish at the end of a host page
2415              * and we need to discard.
2416              */
2417         }
2418 
2419         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2420             unsigned long page;
2421             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2422                                                              host_ratio);
2423             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2424 
2425             /* Clean up the bitmap */
2426             for (page = fixup_start_addr;
2427                  page < fixup_start_addr + host_ratio; page++) {
2428                 /*
2429                  * Remark them as dirty, updating the count for any pages
2430                  * that weren't previously dirty.
2431                  */
2432                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2433             }
2434         }
2435 
2436         /* Find the next dirty page for the next iteration */
2437         run_start = find_next_bit(bitmap, pages, run_start);
2438     }
2439 }
2440 
2441 /**
2442  * postcopy_chunk_hostpages: discard any partially sent host page
2443  *
2444  * Utility for the outgoing postcopy code.
2445  *
2446  * Discard any partially sent host-page size chunks, mark any partially
2447  * dirty host-page size chunks as all dirty.  In this case the host-page
2448  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2449  *
2450  * Returns zero on success
2451  *
2452  * @ms: current migration state
2453  * @block: block we want to work with
2454  */
2455 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2456 {
2457     postcopy_discard_send_init(ms, block->idstr);
2458 
2459     /*
2460      * Ensure that all partially dirty host pages are made fully dirty.
2461      */
2462     postcopy_chunk_hostpages_pass(ms, block);
2463 
2464     postcopy_discard_send_finish(ms);
2465     return 0;
2466 }
2467 
2468 /**
2469  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2470  *
2471  * Returns zero on success
2472  *
2473  * Transmit the set of pages to be discarded after precopy to the target
2474  * Transmit the set of pages to be discarded after precopy to the target;
2475  * these are pages that:
2476  *     a) Have been previously transmitted but are now dirty again
2477  *     b) Have never been transmitted; this ensures that any pages on the
2478  *        destination that have been mapped by background tasks get
2479  *        discarded (transparent huge pages are the specific concern)
2480  *
2481  * @ms: current migration state
2482  */
2483 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2484 {
2485     RAMState *rs = ram_state;
2486     RAMBlock *block;
2487     int ret;
2488 
2489     RCU_READ_LOCK_GUARD();
2490 
2491     /* This should be our last sync, the src is now paused */
2492     migration_bitmap_sync(rs);
2493 
2494     /* Easiest way to make sure we don't resume in the middle of a host-page */
2495     rs->last_seen_block = NULL;
2496     rs->last_sent_block = NULL;
2497     rs->last_page = 0;
2498 
2499     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2500         /* Deal with TPS != HPS and huge pages */
2501         ret = postcopy_chunk_hostpages(ms, block);
2502         if (ret) {
2503             return ret;
2504         }
2505 
2506 #ifdef DEBUG_POSTCOPY
2507         ram_debug_dump_bitmap(block->bmap, true,
2508                               block->used_length >> TARGET_PAGE_BITS);
2509 #endif
2510     }
2511     trace_ram_postcopy_send_discard_bitmap();
2512 
2513     return postcopy_each_ram_send_discard(ms);
2514 }
2515 
2516 /**
2517  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2518  *
2519  * Returns zero on success
2520  *
2521  * @rbname: name of the RAMBlock of the request. NULL means the
2522  *          same as the last one.
2523  * @start: starting offset (in bytes) within the RAMBlock
2524  * @length: length (in bytes) to discard
2525  */
2526 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2527 {
2528     trace_ram_discard_range(rbname, start, length);
2529 
2530     RCU_READ_LOCK_GUARD();
2531     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2532 
2533     if (!rb) {
2534         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2535         return -1;
2536     }
2537 
2538     /*
2539      * On source VM, we don't need to update the received bitmap since
2540      * we don't even have one.
2541      */
2542     if (rb->receivedmap) {
2543         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2544                      length >> qemu_target_page_bits());
2545     }
2546 
2547     return ram_block_discard_range(rb, start, length);
2548 }
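
/*
 * Usage sketch (hypothetical values): discard the first host page of a
 * block named "pc.ram" on the destination, clearing its receivedmap bits
 * as a side effect:
 *
 *     ram_discard_range("pc.ram", 0, qemu_real_host_page_size);
 */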
2549 
2550 /*
2551  * For every allocation, we will try not to crash the VM if the
2552  * allocation fails.
2553  */
2554 static int xbzrle_init(void)
2555 {
2556     Error *local_err = NULL;
2557 
2558     if (!migrate_use_xbzrle()) {
2559         return 0;
2560     }
2561 
2562     XBZRLE_cache_lock();
2563 
2564     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2565     if (!XBZRLE.zero_target_page) {
2566         error_report("%s: Error allocating zero page", __func__);
2567         goto err_out;
2568     }
2569 
2570     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2571                               TARGET_PAGE_SIZE, &local_err);
2572     if (!XBZRLE.cache) {
2573         error_report_err(local_err);
2574         goto free_zero_page;
2575     }
2576 
2577     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2578     if (!XBZRLE.encoded_buf) {
2579         error_report("%s: Error allocating encoded_buf", __func__);
2580         goto free_cache;
2581     }
2582 
2583     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2584     if (!XBZRLE.current_buf) {
2585         error_report("%s: Error allocating current_buf", __func__);
2586         goto free_encoded_buf;
2587     }
2588 
2589     /* We are all good */
2590     XBZRLE_cache_unlock();
2591     return 0;
2592 
2593 free_encoded_buf:
2594     g_free(XBZRLE.encoded_buf);
2595     XBZRLE.encoded_buf = NULL;
2596 free_cache:
2597     cache_fini(XBZRLE.cache);
2598     XBZRLE.cache = NULL;
2599 free_zero_page:
2600     g_free(XBZRLE.zero_target_page);
2601     XBZRLE.zero_target_page = NULL;
2602 err_out:
2603     XBZRLE_cache_unlock();
2604     return -ENOMEM;
2605 }
2606 
2607 static int ram_state_init(RAMState **rsp)
2608 {
2609     *rsp = g_try_new0(RAMState, 1);
2610 
2611     if (!*rsp) {
2612         error_report("%s: Init ramstate fail", __func__);
2613         return -1;
2614     }
2615 
2616     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2617     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2618     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2619 
2620     /*
2621      * Count the total number of pages used by ram blocks not including any
2622      * gaps due to alignment or unplugs.
2623      * This must match the initial value of the dirty bitmap.
2624      */
2625     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2626     ram_state_reset(*rsp);
2627 
2628     return 0;
2629 }
2630 
2631 static void ram_list_init_bitmaps(void)
2632 {
2633     MigrationState *ms = migrate_get_current();
2634     RAMBlock *block;
2635     unsigned long pages;
2636     uint8_t shift;
2637 
2638     /* Skip setting bitmap if there is no RAM */
2639     if (ram_bytes_total()) {
2640         shift = ms->clear_bitmap_shift;
2641         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2642             error_report("clear_bitmap_shift (%u) too big, using "
2643                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2644             shift = CLEAR_BITMAP_SHIFT_MAX;
2645         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2646             error_report("clear_bitmap_shift (%u) too small, using "
2647                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2648             shift = CLEAR_BITMAP_SHIFT_MIN;
2649         }
2650 
2651         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2652             pages = block->max_length >> TARGET_PAGE_BITS;
2653             /*
2654              * The initial dirty bitmap for migration must be set with all
2655              * ones to make sure we'll migrate every guest RAM page to the
2656              * destination.
2657              * Here we set RAMBlock.bmap all to 1 because when we restart a
2658              * migration after a failed one, ram_list.
2659              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2660              * guest memory.
2661              */
2662             block->bmap = bitmap_new(pages);
2663             bitmap_set(block->bmap, 0, pages);
2664             block->clear_bmap_shift = shift;
2665             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2666         }
2667     }
2668 }
2669 
2670 static void ram_init_bitmaps(RAMState *rs)
2671 {
2672     /* For memory_global_dirty_log_start below.  */
2673     qemu_mutex_lock_iothread();
2674     qemu_mutex_lock_ramlist();
2675 
2676     WITH_RCU_READ_LOCK_GUARD() {
2677         ram_list_init_bitmaps();
2678         /* We don't use dirty log with background snapshots */
2679         if (!migrate_background_snapshot()) {
2680             memory_global_dirty_log_start();
2681             migration_bitmap_sync_precopy(rs);
2682         }
2683     }
2684     qemu_mutex_unlock_ramlist();
2685     qemu_mutex_unlock_iothread();
2686 }
2687 
2688 static int ram_init_all(RAMState **rsp)
2689 {
2690     if (ram_state_init(rsp)) {
2691         return -1;
2692     }
2693 
2694     if (xbzrle_init()) {
2695         ram_state_cleanup(rsp);
2696         return -1;
2697     }
2698 
2699     ram_init_bitmaps(*rsp);
2700 
2701     return 0;
2702 }
2703 
2704 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2705 {
2706     RAMBlock *block;
2707     uint64_t pages = 0;
2708 
2709     /*
2710      * Postcopy is not using xbzrle/compression, so no need for that.
2711      * Also, since the source is already halted, we don't need to care
2712      * about dirty page logging either.
2713      */
2714 
2715     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2716         pages += bitmap_count_one(block->bmap,
2717                                   block->used_length >> TARGET_PAGE_BITS);
2718     }
2719 
2720     /* This may not be aligned with current bitmaps. Recalculate. */
2721     rs->migration_dirty_pages = pages;
2722 
2723     rs->last_seen_block = NULL;
2724     rs->last_sent_block = NULL;
2725     rs->last_page = 0;
2726     rs->last_version = ram_list.version;
2727     /*
2728      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2729      * matter what we have sent.
2730      */
2731     rs->ram_bulk_stage = false;
2732 
2733     /* Update RAMState cache of output QEMUFile */
2734     rs->f = out;
2735 
2736     trace_ram_state_resume_prepare(pages);
2737 }
2738 
2739 /*
2740  * This function clears bits of the free pages reported by the caller from the
2741  * migration dirty bitmap. @addr is the host address corresponding to the
2742  * start of the continuous guest free pages, and @len is the total bytes of
2743  * those pages.
2744  */
2745 void qemu_guest_free_page_hint(void *addr, size_t len)
2746 {
2747     RAMBlock *block;
2748     ram_addr_t offset;
2749     size_t used_len, start, npages;
2750     MigrationState *s = migrate_get_current();
2751 
2752     /* This function is currently expected to be used during live migration */
2753     if (!migration_is_setup_or_active(s->state)) {
2754         return;
2755     }
2756 
2757     for (; len > 0; len -= used_len, addr += used_len) {
2758         block = qemu_ram_block_from_host(addr, false, &offset);
2759         if (unlikely(!block || offset >= block->used_length)) {
2760             /*
2761              * The implementation might not support RAMBlock resize during
2762              * live migration, but it could happen in theory with future
2763              * updates. So we add a check here to capture that case.
2764              */
2765             error_report_once("%s unexpected error", __func__);
2766             return;
2767         }
2768 
2769         if (len <= block->used_length - offset) {
2770             used_len = len;
2771         } else {
2772             used_len = block->used_length - offset;
2773         }
2774 
2775         start = offset >> TARGET_PAGE_BITS;
2776         npages = used_len >> TARGET_PAGE_BITS;
2777 
2778         qemu_mutex_lock(&ram_state->bitmap_mutex);
2779         ram_state->migration_dirty_pages -=
2780                       bitmap_count_one_with_offset(block->bmap, start, npages);
2781         bitmap_clear(block->bmap, start, npages);
2782         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2783     }
2784 }
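
/*
 * Usage sketch (hypothetical values): a free page hinting device such as
 * virtio-balloon reporting 2MiB of guest-free memory at host address hva
 * would clear the matching dirty bits so those pages are not sent:
 *
 *     qemu_guest_free_page_hint(hva, 2 * 1024 * 1024);
 */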
2785 
2786 /*
2787  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2788  * long-running RCU critical section.  When rcu-reclaims in the code
2789  * a long-running RCU critical section.  When rcu-reclaims in the code
2790  * granularity of these critical sections.
2791  */
2792 
2793 /**
2794  * ram_save_setup: Setup RAM for migration
2795  *
2796  * Returns zero to indicate success and negative for error
2797  *
2798  * @f: QEMUFile where to send the data
2799  * @opaque: RAMState pointer
2800  */
2801 static int ram_save_setup(QEMUFile *f, void *opaque)
2802 {
2803     RAMState **rsp = opaque;
2804     RAMBlock *block;
2805 
2806     if (compress_threads_save_setup()) {
2807         return -1;
2808     }
2809 
2810     /* migration has already set up the bitmap, reuse it. */
2811     if (!migration_in_colo_state()) {
2812         if (ram_init_all(rsp) != 0) {
2813             compress_threads_save_cleanup();
2814             return -1;
2815         }
2816     }
2817     (*rsp)->f = f;
2818 
2819     WITH_RCU_READ_LOCK_GUARD() {
2820         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2821 
2822         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2823             qemu_put_byte(f, strlen(block->idstr));
2824             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2825             qemu_put_be64(f, block->used_length);
2826             if (migrate_postcopy_ram() && block->page_size !=
2827                                           qemu_host_page_size) {
2828                 qemu_put_be64(f, block->page_size);
2829             }
2830             if (migrate_ignore_shared()) {
2831                 qemu_put_be64(f, block->mr->addr);
2832             }
2833         }
2834     }
2835 
2836     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2837     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2838 
2839     multifd_send_sync_main(f);
2840     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2841     qemu_fflush(f);
2842 
2843     return 0;
2844 }
2845 
2846 /**
2847  * ram_save_iterate: iterative stage for migration
2848  *
2849  * Returns zero to indicate success and negative for error
2850  *
2851  * @f: QEMUFile where to send the data
2852  * @opaque: RAMState pointer
2853  */
2854 static int ram_save_iterate(QEMUFile *f, void *opaque)
2855 {
2856     RAMState **temp = opaque;
2857     RAMState *rs = *temp;
2858     int ret = 0;
2859     int i;
2860     int64_t t0;
2861     int done = 0;
2862 
2863     if (blk_mig_bulk_active()) {
2864         /* Avoid transferring RAM during the bulk phase of block migration
2865          * as the bulk phase will usually take a long time and transferring
2866          * RAM updates during that time is pointless. */
2867         goto out;
2868     }
2869 
2870     WITH_RCU_READ_LOCK_GUARD() {
2871         if (ram_list.version != rs->last_version) {
2872             ram_state_reset(rs);
2873         }
2874 
2875         /* Read version before ram_list.blocks */
2876         smp_rmb();
2877 
2878         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2879 
2880         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2881         i = 0;
2882         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2883                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2884             int pages;
2885 
2886             if (qemu_file_get_error(f)) {
2887                 break;
2888             }
2889 
2890             pages = ram_find_and_save_block(rs, false);
2891             /* no more pages to send */
2892             if (pages == 0) {
2893                 done = 1;
2894                 break;
2895             }
2896 
2897             if (pages < 0) {
2898                 qemu_file_set_error(f, pages);
2899                 break;
2900             }
2901 
2902             rs->target_page_count += pages;
2903 
2904             /*
2905              * During postcopy, it is necessary to make sure one whole host
2906              * page is sent in one chunk.
2907              */
2908             if (migrate_postcopy_ram()) {
2909                 flush_compressed_data(rs);
2910             }
2911 
2912             /*
2913              * We want to check on the 1st loop iteration, just in case it
2914              * was the 1st time and we had to sync the dirty bitmap.
2915              * qemu_clock_get_ns() is a bit expensive, so we only check every
2916              * few iterations
2917              */
2918             if ((i & 63) == 0) {
2919                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2920                               1000000;
2921                 if (t1 > MAX_WAIT) {
2922                     trace_ram_save_iterate_big_wait(t1, i);
2923                     break;
2924                 }
2925             }
2926             i++;
2927         }
2928     }
2929 
2930     /*
2931      * Must occur before EOS (or any QEMUFile operation)
2932      * because of RDMA protocol.
2933      */
2934     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2935 
2936 out:
2937     if (ret >= 0
2938         && migration_is_setup_or_active(migrate_get_current()->state)) {
2939         multifd_send_sync_main(rs->f);
2940         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2941         qemu_fflush(f);
2942         ram_counters.transferred += 8;
2943 
2944         ret = qemu_file_get_error(f);
2945     }
2946     if (ret < 0) {
2947         return ret;
2948     }
2949 
2950     return done;
2951 }
2952 
2953 /**
2954  * ram_save_complete: function called to send the remaining amount of ram
2955  *
2956  * Returns zero to indicate success or negative on error
2957  *
2958  * Called with iothread lock
2959  *
2960  * @f: QEMUFile where to send the data
2961  * @opaque: RAMState pointer
2962  */
2963 static int ram_save_complete(QEMUFile *f, void *opaque)
2964 {
2965     RAMState **temp = opaque;
2966     RAMState *rs = *temp;
2967     int ret = 0;
2968 
2969     WITH_RCU_READ_LOCK_GUARD() {
2970         if (!migration_in_postcopy()) {
2971             migration_bitmap_sync_precopy(rs);
2972         }
2973 
2974         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2975 
2976         /* try transferring iterative blocks of memory */
2977 
2978         /* flush all remaining blocks regardless of rate limiting */
2979         while (true) {
2980             int pages;
2981 
2982             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2983             /* no more blocks to send */
2984             if (pages == 0) {
2985                 break;
2986             }
2987             if (pages < 0) {
2988                 ret = pages;
2989                 break;
2990             }
2991         }
2992 
2993         flush_compressed_data(rs);
2994         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2995     }
2996 
2997     if (ret >= 0) {
2998         multifd_send_sync_main(rs->f);
2999         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3000         qemu_fflush(f);
3001     }
3002 
3003     return ret;
3004 }
3005 
3006 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3007                              uint64_t *res_precopy_only,
3008                              uint64_t *res_compatible,
3009                              uint64_t *res_postcopy_only)
3010 {
3011     RAMState **temp = opaque;
3012     RAMState *rs = *temp;
3013     uint64_t remaining_size;
3014 
3015     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3016 
3017     if (!migration_in_postcopy() &&
3018         remaining_size < max_size) {
3019         qemu_mutex_lock_iothread();
3020         WITH_RCU_READ_LOCK_GUARD() {
3021             migration_bitmap_sync_precopy(rs);
3022         }
3023         qemu_mutex_unlock_iothread();
3024         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3025     }
3026 
3027     if (migrate_postcopy_ram()) {
3028         /* We can do postcopy, and all the data is postcopiable */
3029         *res_compatible += remaining_size;
3030     } else {
3031         *res_precopy_only += remaining_size;
3032     }
3033 }
3034 
3035 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3036 {
3037     unsigned int xh_len;
3038     int xh_flags;
3039     uint8_t *loaded_data;
3040 
3041     /* extract RLE header */
3042     xh_flags = qemu_get_byte(f);
3043     xh_len = qemu_get_be16(f);
3044 
3045     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3046         error_report("Failed to load XBZRLE page - wrong compression!");
3047         return -1;
3048     }
3049 
3050     if (xh_len > TARGET_PAGE_SIZE) {
3051         error_report("Failed to load XBZRLE page - len overflow!");
3052         return -1;
3053     }
3054     loaded_data = XBZRLE.decoded_buf;
3055     /* load data and decode */
3056     /* it can change loaded_data to point to an internal buffer */
3057     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3058 
3059     /* decode RLE */
3060     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3061                              TARGET_PAGE_SIZE) == -1) {
3062         error_report("Failed to load XBZRLE page - decode error!");
3063         return -1;
3064     }
3065 
3066     return 0;
3067 }
3068 
3069 /**
3070  * ram_block_from_stream: read a RAMBlock id from the migration stream
3071  *
3072  * Must be called from within a rcu critical section.
3073  *
3074  * Returns a pointer from within the RCU-protected ram_list.
3075  *
3076  * @f: QEMUFile where to read the data from
3077  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3078  */
3079 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3080 {
3081     static RAMBlock *block;
3082     char id[256];
3083     uint8_t len;
3084 
3085     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3086         if (!block) {
3087             error_report("Ack, bad migration stream!");
3088             return NULL;
3089         }
3090         return block;
3091     }
3092 
3093     len = qemu_get_byte(f);
3094     qemu_get_buffer(f, (uint8_t *)id, len);
3095     id[len] = 0;
3096 
3097     block = qemu_ram_block_by_name(id);
3098     if (!block) {
3099         error_report("Can't find block %s", id);
3100         return NULL;
3101     }
3102 
3103     if (ramblock_is_ignored(block)) {
3104         error_report("block %s should not be migrated !", id);
3105         return NULL;
3106     }
3107 
3108     return block;
3109 }
3110 
3111 static inline void *host_from_ram_block_offset(RAMBlock *block,
3112                                                ram_addr_t offset)
3113 {
3114     if (!offset_in_ramblock(block, offset)) {
3115         return NULL;
3116     }
3117 
3118     return block->host + offset;
3119 }
3120 
3121 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3122                              ram_addr_t offset, bool record_bitmap)
3123 {
3124     if (!offset_in_ramblock(block, offset)) {
3125         return NULL;
3126     }
3127     if (!block->colo_cache) {
3128         error_report("%s: colo_cache is NULL in block :%s",
3129                      __func__, block->idstr);
3130         return NULL;
3131     }
3132 
3133     /*
3134      * During a COLO checkpoint we need a bitmap of these migrated pages.
3135      * It helps us decide which pages in the RAM cache should be flushed
3136      * into the VM's RAM later.
3137     */
3138     if (record_bitmap &&
3139         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3140         ram_state->migration_dirty_pages++;
3141     }
3142     return block->colo_cache + offset;
3143 }
3144 
3145 /**
3146  * ram_handle_compressed: handle the zero page case
3147  *
3148  * If a page (or a whole RDMA chunk) has been
3149  * determined to be zero, then zap it.
3150  *
3151  * @host: host address for the zero page
3152  * @ch: what the page is filled from.  We only support zero
3153  * @size: size of the zero page
3154  */
3155 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3156 {
3157     if (ch != 0 || !is_zero_range(host, size)) {
3158         memset(host, ch, size);
3159     }
3160 }
3161 
3162 /* return the size after decompression, or negative value on error */
3163 static int
3164 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3165                      const uint8_t *source, size_t source_len)
3166 {
3167     int err;
3168 
3169     err = inflateReset(stream);
3170     if (err != Z_OK) {
3171         return -1;
3172     }
3173 
3174     stream->avail_in = source_len;
3175     stream->next_in = (uint8_t *)source;
3176     stream->avail_out = dest_len;
3177     stream->next_out = dest;
3178 
3179     err = inflate(stream, Z_NO_FLUSH);
3180     if (err != Z_STREAM_END) {
3181         return -1;
3182     }
3183 
3184     return stream->total_out;
3185 }
3186 
3187 static void *do_data_decompress(void *opaque)
3188 {
3189     DecompressParam *param = opaque;
3190     unsigned long pagesize;
3191     uint8_t *des;
3192     int len, ret;
3193 
3194     qemu_mutex_lock(&param->mutex);
3195     while (!param->quit) {
3196         if (param->des) {
3197             des = param->des;
3198             len = param->len;
3199             param->des = 0;
3200             qemu_mutex_unlock(&param->mutex);
3201 
3202             pagesize = TARGET_PAGE_SIZE;
3203 
3204             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3205                                        param->compbuf, len);
3206             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3207                 error_report("decompress data failed");
3208                 qemu_file_set_error(decomp_file, ret);
3209             }
3210 
3211             qemu_mutex_lock(&decomp_done_lock);
3212             param->done = true;
3213             qemu_cond_signal(&decomp_done_cond);
3214             qemu_mutex_unlock(&decomp_done_lock);
3215 
3216             qemu_mutex_lock(&param->mutex);
3217         } else {
3218             qemu_cond_wait(&param->cond, &param->mutex);
3219         }
3220     }
3221     qemu_mutex_unlock(&param->mutex);
3222 
3223     return NULL;
3224 }
3225 
3226 static int wait_for_decompress_done(void)
3227 {
3228     int idx, thread_count;
3229 
3230     if (!migrate_use_compression()) {
3231         return 0;
3232     }
3233 
3234     thread_count = migrate_decompress_threads();
3235     qemu_mutex_lock(&decomp_done_lock);
3236     for (idx = 0; idx < thread_count; idx++) {
3237         while (!decomp_param[idx].done) {
3238             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3239         }
3240     }
3241     qemu_mutex_unlock(&decomp_done_lock);
3242     return qemu_file_get_error(decomp_file);
3243 }
3244 
3245 static void compress_threads_load_cleanup(void)
3246 {
3247     int i, thread_count;
3248 
3249     if (!migrate_use_compression()) {
3250         return;
3251     }
3252     thread_count = migrate_decompress_threads();
3253     for (i = 0; i < thread_count; i++) {
3254         /*
3255          * we use it as an indicator of whether the thread was
3256          * properly initialized or not
3257          */
3258         if (!decomp_param[i].compbuf) {
3259             break;
3260         }
3261 
3262         qemu_mutex_lock(&decomp_param[i].mutex);
3263         decomp_param[i].quit = true;
3264         qemu_cond_signal(&decomp_param[i].cond);
3265         qemu_mutex_unlock(&decomp_param[i].mutex);
3266     }
3267     for (i = 0; i < thread_count; i++) {
3268         if (!decomp_param[i].compbuf) {
3269             break;
3270         }
3271 
3272         qemu_thread_join(decompress_threads + i);
3273         qemu_mutex_destroy(&decomp_param[i].mutex);
3274         qemu_cond_destroy(&decomp_param[i].cond);
3275         inflateEnd(&decomp_param[i].stream);
3276         g_free(decomp_param[i].compbuf);
3277         decomp_param[i].compbuf = NULL;
3278     }
3279     g_free(decompress_threads);
3280     g_free(decomp_param);
3281     decompress_threads = NULL;
3282     decomp_param = NULL;
3283     decomp_file = NULL;
3284 }
3285 
3286 static int compress_threads_load_setup(QEMUFile *f)
3287 {
3288     int i, thread_count;
3289 
3290     if (!migrate_use_compression()) {
3291         return 0;
3292     }
3293 
3294     thread_count = migrate_decompress_threads();
3295     decompress_threads = g_new0(QemuThread, thread_count);
3296     decomp_param = g_new0(DecompressParam, thread_count);
3297     qemu_mutex_init(&decomp_done_lock);
3298     qemu_cond_init(&decomp_done_cond);
3299     decomp_file = f;
3300     for (i = 0; i < thread_count; i++) {
3301         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3302             goto exit;
3303         }
3304 
3305         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3306         qemu_mutex_init(&decomp_param[i].mutex);
3307         qemu_cond_init(&decomp_param[i].cond);
3308         decomp_param[i].done = true;
3309         decomp_param[i].quit = false;
3310         qemu_thread_create(decompress_threads + i, "decompress",
3311                            do_data_decompress, decomp_param + i,
3312                            QEMU_THREAD_JOINABLE);
3313     }
3314     return 0;
3315 exit:
3316     compress_threads_load_cleanup();
3317     return -1;
3318 }
3319 
3320 static void decompress_data_with_multi_threads(QEMUFile *f,
3321                                                void *host, int len)
3322 {
3323     int idx, thread_count;
3324 
3325     thread_count = migrate_decompress_threads();
3326     QEMU_LOCK_GUARD(&decomp_done_lock);
3327     while (true) {
3328         for (idx = 0; idx < thread_count; idx++) {
3329             if (decomp_param[idx].done) {
3330                 decomp_param[idx].done = false;
3331                 qemu_mutex_lock(&decomp_param[idx].mutex);
3332                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3333                 decomp_param[idx].des = host;
3334                 decomp_param[idx].len = len;
3335                 qemu_cond_signal(&decomp_param[idx].cond);
3336                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3337                 break;
3338             }
3339         }
3340         if (idx < thread_count) {
3341             break;
3342         } else {
3343             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3344         }
3345     }
3346 }
3347 
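/*
 * Illustrative sketch only -- not part of ram.c.  The dispatcher above
 * hands a page to the first idle worker (done == true) and otherwise
 * sleeps on decomp_done_cond until some worker finishes.  The same
 * "done flag plus condition variable" handshake, reduced to a single
 * worker and written against plain pthreads, looks roughly like this
 * (kept under #if 0 so it is not built):
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ex_done_cond = PTHREAD_COND_INITIALIZER;
static bool ex_done = true;             /* worker is idle, ready for work */

static void example_dispatch_one(void)
{
    pthread_mutex_lock(&ex_lock);
    while (!ex_done) {                  /* wait for the worker to go idle */
        pthread_cond_wait(&ex_done_cond, &ex_lock);
    }
    ex_done = false;                    /* hand over one unit of work */
    pthread_mutex_unlock(&ex_lock);
}

static void example_worker_finish(void)
{
    pthread_mutex_lock(&ex_lock);
    ex_done = true;                     /* work complete, idle again */
    pthread_cond_signal(&ex_done_cond);
    pthread_mutex_unlock(&ex_lock);
}
#endif
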
3348 /*
3349  * We must set ram_bulk_stage to false; otherwise
3350  * migration_bitmap_find_dirty would ignore the bitmap and all the
3351  * pages in the RAM cache would be flushed to the RAM of the
3352  * secondary VM.
3353  */
3354 static void colo_init_ram_state(void)
3355 {
3356     ram_state_init(&ram_state);
3357     ram_state->ram_bulk_stage = false;
3358 }
3359 
3360 /*
3361  * COLO cache: this is for the secondary VM.  We cache the whole
3362  * memory of the secondary VM.  The global lock must be held to
3363  * call this helper.
3364  */
3365 int colo_init_ram_cache(void)
3366 {
3367     RAMBlock *block;
3368 
3369     WITH_RCU_READ_LOCK_GUARD() {
3370         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3372                                                     NULL,
3373                                                     false);
3374             if (!block->colo_cache) {
3375                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3376                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3377                              block->used_length);
3378                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3379                     if (block->colo_cache) {
3380                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3381                         block->colo_cache = NULL;
3382                     }
3383                 }
3384                 return -errno;
3385             }
3386         }
3387     }
3388 
3389     /*
3390      * Record the dirty pages sent by the PVM; we use this dirty bitmap
3391      * to decide which pages in the cache should be flushed into the
3392      * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3393      */
3394     if (ram_bytes_total()) {
3395         RAMBlock *block;
3396 
3397         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3399             block->bmap = bitmap_new(pages);
3400         }
3401     }
3402 
3403     colo_init_ram_state();
3404     return 0;
3405 }
3406 
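/*
 * Illustrative sketch only -- not part of ram.c.  colo_init_ram_cache()
 * above allocates one page-granular dirty bitmap per RAMBlock, sized as
 * max_length >> TARGET_PAGE_BITS bits.  Assuming a hypothetical 4 GiB
 * block and 4 KiB target pages, the sizing works out as follows (kept
 * under #if 0 so it is not built):
 */
#if 0
#include <stdint.h>

static void example_colo_bitmap_sizing(void)
{
    const uint64_t block_bytes = 4ULL << 30;            /* 4 GiB RAMBlock     */
    const unsigned target_page_bits = 12;               /* 4 KiB target pages */
    uint64_t pages = block_bytes >> target_page_bits;   /* 1048576 bits       */
    uint64_t bitmap_bytes = (pages + 7) / 8;            /* 128 KiB bitmap     */

    (void)bitmap_bytes;
}
#endif
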
3407 /* TODO: duplicated with ram_init_bitmaps */
3408 void colo_incoming_start_dirty_log(void)
3409 {
3410     RAMBlock *block = NULL;
3411     /* For memory_global_dirty_log_start below. */
3412     qemu_mutex_lock_iothread();
3413     qemu_mutex_lock_ramlist();
3414 
3415     memory_global_dirty_log_sync();
3416     WITH_RCU_READ_LOCK_GUARD() {
3417         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3418             ramblock_sync_dirty_bitmap(ram_state, block);
3419             /* Discard this dirty bitmap record */
3420             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3421         }
3422         memory_global_dirty_log_start();
3423     }
3424     ram_state->migration_dirty_pages = 0;
3425     qemu_mutex_unlock_ramlist();
3426     qemu_mutex_unlock_iothread();
3427 }
3428 
3429 /* The global lock must be held to call this helper */
3430 void colo_release_ram_cache(void)
3431 {
3432     RAMBlock *block;
3433 
3434     memory_global_dirty_log_stop();
3435     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436         g_free(block->bmap);
3437         block->bmap = NULL;
3438     }
3439 
3440     WITH_RCU_READ_LOCK_GUARD() {
3441         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3442             if (block->colo_cache) {
3443                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3444                 block->colo_cache = NULL;
3445             }
3446         }
3447     }
3448     ram_state_cleanup(&ram_state);
3449 }
3450 
3451 /**
3452  * ram_load_setup: Set up RAM for the incoming side of migration
3453  *
3454  * Returns zero to indicate success and negative for error
3455  *
3456  * @f: QEMUFile where to receive the data
3457  * @opaque: RAMState pointer
3458  */
3459 static int ram_load_setup(QEMUFile *f, void *opaque)
3460 {
3461     if (compress_threads_load_setup(f)) {
3462         return -1;
3463     }
3464 
3465     xbzrle_load_setup();
3466     ramblock_recv_map_init();
3467 
3468     return 0;
3469 }
3470 
3471 static int ram_load_cleanup(void *opaque)
3472 {
3473     RAMBlock *rb;
3474 
3475     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3476         qemu_ram_block_writeback(rb);
3477     }
3478 
3479     xbzrle_load_cleanup();
3480     compress_threads_load_cleanup();
3481 
3482     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3483         g_free(rb->receivedmap);
3484         rb->receivedmap = NULL;
3485     }
3486 
3487     return 0;
3488 }
3489 
3490 /**
3491  * ram_postcopy_incoming_init: allocate postcopy data structures
3492  *
3493  * Returns 0 for success and negative if there was an error
3494  *
3495  * @mis: current migration incoming state
3496  *
3497  * Allocate data structures etc. needed by incoming migration with
3498  * postcopy-ram.  postcopy-ram's similarly named
3499  * postcopy_ram_incoming_init does the work.
3500  */
3501 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3502 {
3503     return postcopy_ram_incoming_init(mis);
3504 }
3505 
3506 /**
3507  * ram_load_postcopy: load a page in postcopy case
3508  *
3509  * Returns 0 for success or -errno in case of error
3510  *
3511  * Called in postcopy mode by ram_load().
3512  * rcu_read_lock is taken prior to this being called.
3513  *
3514  * @f: QEMUFile where to receive the data
3515  */
3516 static int ram_load_postcopy(QEMUFile *f)
3517 {
3518     int flags = 0, ret = 0;
3519     bool place_needed = false;
3520     bool matches_target_page_size = false;
3521     MigrationIncomingState *mis = migration_incoming_get_current();
3522     /* Temporary page that is later 'placed' */
3523     void *postcopy_host_page = mis->postcopy_tmp_page;
3524     void *this_host = NULL;
3525     bool all_zero = true;
3526     int target_pages = 0;
3527 
3528     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3529         ram_addr_t addr;
3530         void *host = NULL;
3531         void *page_buffer = NULL;
3532         void *place_source = NULL;
3533         RAMBlock *block = NULL;
3534         uint8_t ch;
3535         int len;
3536 
3537         addr = qemu_get_be64(f);
3538 
3539         /*
3540          * If there is a QEMUFile error, we should stop here; in that
3541          * case "addr" may be invalid.
3542          */
3543         ret = qemu_file_get_error(f);
3544         if (ret) {
3545             break;
3546         }
3547 
3548         flags = addr & ~TARGET_PAGE_MASK;
3549         addr &= TARGET_PAGE_MASK;
3550 
3551         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3552         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3553                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3554             block = ram_block_from_stream(f, flags);
3555 
3556             host = host_from_ram_block_offset(block, addr);
3557             if (!host) {
3558                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3559                 ret = -EINVAL;
3560                 break;
3561             }
3562             target_pages++;
3563             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3564             /*
3565              * Postcopy requires that we place whole host pages atomically;
3566              * these may be huge pages for RAMBlocks that are backed by
3567              * hugetlbfs.
3568              * To make it atomic, the data is read into a temporary page
3569              * that's moved into place later.
3570              * The migration protocol uses possibly smaller target pages;
3571              * however, the source ensures it always sends all the
3572              * components of a host page in one chunk.
3573              */
3574             page_buffer = postcopy_host_page +
3575                           ((uintptr_t)host & (block->page_size - 1));
3576             if (target_pages == 1) {
3577                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3578                                                     block->page_size);
3579             } else {
3580                 /* not the first target page within the host page */
3581                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3582                     (uintptr_t)this_host) {
3583                     error_report("Non-same host page %p/%p",
3584                                   host, this_host);
3585                     ret = -EINVAL;
3586                     break;
3587                 }
3588             }
3589 
3590             /*
3591              * If it's the last part of a host page, then we place the
3592              * host page.
3593              */
3594             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3595                 place_needed = true;
3596             }
3597             place_source = postcopy_host_page;
3598         }
3599 
3600         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3601         case RAM_SAVE_FLAG_ZERO:
3602             ch = qemu_get_byte(f);
3603             /*
3604              * We can skip setting page_buffer when this is a zero page
3605              * and (block->page_size == TARGET_PAGE_SIZE).
3606              */
3607             if (ch || !matches_target_page_size) {
3608                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3609             }
3610             if (ch) {
3611                 all_zero = false;
3612             }
3613             break;
3614 
3615         case RAM_SAVE_FLAG_PAGE:
3616             all_zero = false;
3617             if (!matches_target_page_size) {
3618                 /* For huge pages, we always use a temporary buffer */
3619                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3620             } else {
3621                 /*
3622                  * For small pages that match the target page size, we
3623                  * avoid the qemu_file copy.  Instead we directly use
3624                  * the QEMUFile buffer to place the page.  Note: we
3625                  * cannot do any QEMUFile operation before using that
3626                  * buffer, to make sure the buffer is still valid when
3627                  * placing the page.
3628                  */
3629                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3630                                          TARGET_PAGE_SIZE);
3631             }
3632             break;
3633         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3634             all_zero = false;
3635             len = qemu_get_be32(f);
3636             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3637                 error_report("Invalid compressed data length: %d", len);
3638                 ret = -EINVAL;
3639                 break;
3640             }
3641             decompress_data_with_multi_threads(f, page_buffer, len);
3642             break;
3643 
3644         case RAM_SAVE_FLAG_EOS:
3645             /* normal exit */
3646             multifd_recv_sync_main();
3647             break;
3648         default:
3649             error_report("Unknown combination of migration flags: 0x%x"
3650                          " (postcopy mode)", flags);
3651             ret = -EINVAL;
3652             break;
3653         }
3654 
3655         /* Got the whole host page, wait for decompress before placing. */
3656         if (place_needed) {
3657             ret |= wait_for_decompress_done();
3658         }
3659 
3660         /* Detect any possible file errors */
3661         if (!ret && qemu_file_get_error(f)) {
3662             ret = qemu_file_get_error(f);
3663         }
3664 
3665         if (!ret && place_needed) {
3666             /* This gets called at the last target page in the host page */
3667             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3668                                                        block->page_size);
3669 
3670             if (all_zero) {
3671                 ret = postcopy_place_page_zero(mis, place_dest,
3672                                                block);
3673             } else {
3674                 ret = postcopy_place_page(mis, place_dest,
3675                                           place_source, block);
3676             }
3677             place_needed = false;
3678             target_pages = 0;
3679             /* Assume we have a zero page until we detect something different */
3680             all_zero = true;
3681         }
3682     }
3683 
3684     return ret;
3685 }
3686 
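/*
 * Illustrative sketch only -- not part of ram.c.  ram_load_postcopy()
 * above accumulates incoming target pages in a temporary host page and
 * only places the host page once all of its target pages have arrived.
 * With a hypothetical 2 MiB hugepage backing and 4 KiB target pages the
 * bookkeeping reduces to the following arithmetic (kept under #if 0 so
 * it is not built):
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool example_host_page_bookkeeping(uintptr_t host, uintptr_t tmp_page)
{
    const uintptr_t host_page_size = 2u << 20;       /* 2 MiB hugepage */
    const uintptr_t target_page_size = 4096;         /* 4 KiB          */

    /* where this target page lands inside the temporary host page */
    uintptr_t page_buffer = tmp_page + (host & (host_page_size - 1));

    /* destination of the whole host page, once fully received */
    uintptr_t place_dest = host & ~(host_page_size - 1);

    /* the host page is placed after this many target pages */
    uintptr_t pages_per_host_page = host_page_size / target_page_size;

    (void)page_buffer;
    (void)place_dest;
    return pages_per_host_page == 512;
}
#endif
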
3687 static bool postcopy_is_advised(void)
3688 {
3689     PostcopyState ps = postcopy_state_get();
3690     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3691 }
3692 
3693 static bool postcopy_is_running(void)
3694 {
3695     PostcopyState ps = postcopy_state_get();
3696     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3697 }
3698 
3699 /*
3700  * Flush the content of the RAM cache into the SVM's memory.
3701  * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3702  */
3703 void colo_flush_ram_cache(void)
3704 {
3705     RAMBlock *block = NULL;
3706     void *dst_host;
3707     void *src_host;
3708     unsigned long offset = 0;
3709 
3710     memory_global_dirty_log_sync();
3711     WITH_RCU_READ_LOCK_GUARD() {
3712         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3713             ramblock_sync_dirty_bitmap(ram_state, block);
3714         }
3715     }
3716 
3717     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3718     WITH_RCU_READ_LOCK_GUARD() {
3719         block = QLIST_FIRST_RCU(&ram_list.blocks);
3720 
3721         while (block) {
3722             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3723 
3724             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3725                 >= block->used_length) {
3726                 offset = 0;
3727                 block = QLIST_NEXT_RCU(block, next);
3728             } else {
3729                 migration_bitmap_clear_dirty(ram_state, block, offset);
3730                 dst_host = block->host
3731                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3732                 src_host = block->colo_cache
3733                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3734                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3735             }
3736         }
3737     }
3738     trace_colo_flush_ram_cache_end();
3739 }
3740 
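/*
 * Illustrative sketch only -- not part of ram.c.  colo_flush_ram_cache()
 * above walks each block's dirty bitmap and copies only the dirty pages
 * from colo_cache into the SVM's RAM.  A reduced, single-block version
 * of that walk over one plain unsigned long of dirty bits might look
 * like this (kept under #if 0 so it is not built):
 */
#if 0
#include <string.h>

#define EX_PAGE_SIZE 4096
#define EX_PAGES     (8 * sizeof(unsigned long))

static void example_flush_dirty_pages(unsigned long *dirty,
                                      unsigned char *dst_ram,
                                      const unsigned char *cache)
{
    for (unsigned long page = 0; page < EX_PAGES; page++) {
        if (dirty[0] & (1UL << page)) {
            dirty[0] &= ~(1UL << page);             /* clear the dirty bit */
            memcpy(dst_ram + page * EX_PAGE_SIZE,   /* flush cache -> RAM  */
                   cache + page * EX_PAGE_SIZE,
                   EX_PAGE_SIZE);
        }
    }
}
#endif
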
3741 /**
3742  * ram_load_precopy: load pages in precopy case
3743  *
3744  * Returns 0 for success or -errno in case of error
3745  *
3746  * Called in precopy mode by ram_load().
3747  * rcu_read_lock is taken prior to this being called.
3748  *
3749  * @f: QEMUFile where to receive the data
3750  */
3751 static int ram_load_precopy(QEMUFile *f)
3752 {
3753     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3754     /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3755     bool postcopy_advised = postcopy_is_advised();
3756     if (!migrate_use_compression()) {
3757         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3758     }
3759 
3760     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3761         ram_addr_t addr, total_ram_bytes;
3762         void *host = NULL, *host_bak = NULL;
3763         uint8_t ch;
3764 
3765         /*
3766          * Yield periodically to let the main loop run; an iteration of
3767          * the main loop is expensive, so only do it every so often.
3768          */
3769         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3770             aio_co_schedule(qemu_get_current_aio_context(),
3771                             qemu_coroutine_self());
3772             qemu_coroutine_yield();
3773         }
3774         i++;
3775 
3776         addr = qemu_get_be64(f);
3777         flags = addr & ~TARGET_PAGE_MASK;
3778         addr &= TARGET_PAGE_MASK;
3779 
3780         if (flags & invalid_flags) {
3781             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3782                 error_report("Received an unexpected compressed page");
3783             }
3784 
3785             ret = -EINVAL;
3786             break;
3787         }
3788 
3789         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3790                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3791             RAMBlock *block = ram_block_from_stream(f, flags);
3792 
3793             host = host_from_ram_block_offset(block, addr);
3794             /*
3795              * After entering the COLO stage, we should not load pages into
3796              * the SVM's memory directly; we put them into colo_cache first.
3797              * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
3798              * Previously, we copied all of this memory during the COLO
3799              * preparation stage, while the VM had to be stopped, which is
3800              * time-consuming.  Here we optimize it: back up every page
3801              * during migration while COLO is enabled.  This slows the
3802              * migration a little, but it clearly reduces the downtime of
3803              * backing up all the SVM's memory in the COLO preparation stage.
3804              */
3805             if (migration_incoming_colo_enabled()) {
3806                 if (migration_incoming_in_colo_state()) {
3807                     /* In COLO stage, put all pages into cache temporarily */
3808                     host = colo_cache_from_block_offset(block, addr, true);
3809                 } else {
3810                    /*
3811                     * In the migration stage but before the COLO stage,
3812                     * put all pages into both the cache and the SVM's memory.
3813                     */
3814                     host_bak = colo_cache_from_block_offset(block, addr, false);
3815                 }
3816             }
3817             if (!host) {
3818                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3819                 ret = -EINVAL;
3820                 break;
3821             }
3822             if (!migration_incoming_in_colo_state()) {
3823                 ramblock_recv_bitmap_set(block, host);
3824             }
3825 
3826             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3827         }
3828 
3829         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3830         case RAM_SAVE_FLAG_MEM_SIZE:
3831             /* Synchronize RAM block list */
3832             total_ram_bytes = addr;
3833             while (!ret && total_ram_bytes) {
3834                 RAMBlock *block;
3835                 char id[256];
3836                 ram_addr_t length;
3837 
3838                 len = qemu_get_byte(f);
3839                 qemu_get_buffer(f, (uint8_t *)id, len);
3840                 id[len] = 0;
3841                 length = qemu_get_be64(f);
3842 
3843                 block = qemu_ram_block_by_name(id);
3844                 if (block && !qemu_ram_is_migratable(block)) {
3845                     error_report("block %s should not be migrated!", id);
3846                     ret = -EINVAL;
3847                 } else if (block) {
3848                     if (length != block->used_length) {
3849                         Error *local_err = NULL;
3850 
3851                         ret = qemu_ram_resize(block, length,
3852                                               &local_err);
3853                         if (local_err) {
3854                             error_report_err(local_err);
3855                         }
3856                     }
3857                     /* For postcopy we need to check that hugepage sizes match */
3858                     if (postcopy_advised && migrate_postcopy_ram() &&
3859                         block->page_size != qemu_host_page_size) {
3860                         uint64_t remote_page_size = qemu_get_be64(f);
3861                         if (remote_page_size != block->page_size) {
3862                             error_report("Mismatched RAM page size %s "
3863                                          "(local) %zd != %" PRId64,
3864                                          id, block->page_size,
3865                                          remote_page_size);
3866                             ret = -EINVAL;
3867                         }
3868                     }
3869                     if (migrate_ignore_shared()) {
3870                         hwaddr addr = qemu_get_be64(f);
3871                         if (ramblock_is_ignored(block) &&
3872                             block->mr->addr != addr) {
3873                             error_report("Mismatched GPAs for block %s "
3874                                          "%" PRId64 " != %" PRId64,
3875                                          id, (uint64_t)addr,
3876                                          (uint64_t)block->mr->addr);
3877                             ret = -EINVAL;
3878                         }
3879                     }
3880                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3881                                           block->idstr);
3882                 } else {
3883                     error_report("Unknown ramblock \"%s\", cannot "
3884                                  "accept migration", id);
3885                     ret = -EINVAL;
3886                 }
3887 
3888                 total_ram_bytes -= length;
3889             }
3890             break;
3891 
3892         case RAM_SAVE_FLAG_ZERO:
3893             ch = qemu_get_byte(f);
3894             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3895             break;
3896 
3897         case RAM_SAVE_FLAG_PAGE:
3898             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3899             break;
3900 
3901         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3902             len = qemu_get_be32(f);
3903             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3904                 error_report("Invalid compressed data length: %d", len);
3905                 ret = -EINVAL;
3906                 break;
3907             }
3908             decompress_data_with_multi_threads(f, host, len);
3909             break;
3910 
3911         case RAM_SAVE_FLAG_XBZRLE:
3912             if (load_xbzrle(f, addr, host) < 0) {
3913                 error_report("Failed to decompress XBZRLE page at "
3914                              RAM_ADDR_FMT, addr);
3915                 ret = -EINVAL;
3916                 break;
3917             }
3918             break;
3919         case RAM_SAVE_FLAG_EOS:
3920             /* normal exit */
3921             multifd_recv_sync_main();
3922             break;
3923         default:
3924             if (flags & RAM_SAVE_FLAG_HOOK) {
3925                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3926             } else {
3927                 error_report("Unknown combination of migration flags: 0x%x",
3928                              flags);
3929                 ret = -EINVAL;
3930             }
3931         }
3932         if (!ret) {
3933             ret = qemu_file_get_error(f);
3934         }
3935         if (!ret && host_bak) {
3936             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3937         }
3938     }
3939 
3940     ret |= wait_for_decompress_done();
3941     return ret;
3942 }
3943 
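/*
 * Illustrative sketch only -- not part of ram.c.  Both load paths above
 * unpack a 64-bit word whose low bits (below the target page size) carry
 * the RAM_SAVE_FLAG_* bits and whose remaining bits carry the page
 * address.  A standalone version of that unpacking, assuming 4 KiB
 * target pages (kept under #if 0 so it is not built):
 */
#if 0
#include <stdint.h>

#define EX_PAGE_SIZE 4096ULL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

static void example_unpack_addr(uint64_t word, uint64_t *addr,
                                unsigned *flags)
{
    *flags = (unsigned)(word & ~EX_PAGE_MASK);  /* e.g. RAM_SAVE_FLAG_PAGE */
    *addr = word & EX_PAGE_MASK;                /* page-aligned offset     */
}
#endif
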
3944 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3945 {
3946     int ret = 0;
3947     static uint64_t seq_iter;
3948     /*
3949      * If the system is running in postcopy mode, page inserts into host
3950      * memory must be atomic.
3951      */
3952     bool postcopy_running = postcopy_is_running();
3953 
3954     seq_iter++;
3955 
3956     if (version_id != 4) {
3957         return -EINVAL;
3958     }
3959 
3960     /*
3961      * This RCU critical section can be very long running.
3962      * When RCU reclaims in the code become numerous, it will be
3963      * necessary to reduce the granularity of this critical
3964      * section.
3965      */
3966     WITH_RCU_READ_LOCK_GUARD() {
3967         if (postcopy_running) {
3968             ret = ram_load_postcopy(f);
3969         } else {
3970             ret = ram_load_precopy(f);
3971         }
3972     }
3973     trace_ram_load_complete(ret, seq_iter);
3974 
3975     return ret;
3976 }
3977 
3978 static bool ram_has_postcopy(void *opaque)
3979 {
3980     RAMBlock *rb;
3981     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3982         if (ramblock_is_pmem(rb)) {
3983             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3984                          "is not supported now!", rb->idstr, rb->host);
3985             return false;
3986         }
3987     }
3988 
3989     return migrate_postcopy_ram();
3990 }
3991 
3992 /* Sync all the dirty bitmaps with the destination VM. */
3993 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3994 {
3995     RAMBlock *block;
3996     QEMUFile *file = s->to_dst_file;
3997     int ramblock_count = 0;
3998 
3999     trace_ram_dirty_bitmap_sync_start();
4000 
4001     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4002         qemu_savevm_send_recv_bitmap(file, block->idstr);
4003         trace_ram_dirty_bitmap_request(block->idstr);
4004         ramblock_count++;
4005     }
4006 
4007     trace_ram_dirty_bitmap_sync_wait();
4008 
4009     /* Wait until all the ramblocks' dirty bitmaps are synced */
4010     while (ramblock_count--) {
4011         qemu_sem_wait(&s->rp_state.rp_sem);
4012     }
4013 
4014     trace_ram_dirty_bitmap_sync_complete();
4015 
4016     return 0;
4017 }
4018 
4019 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4020 {
4021     qemu_sem_post(&s->rp_state.rp_sem);
4022 }
4023 
4024 /*
4025  * Read the received bitmap and invert it to form the initial dirty bitmap.
4026  * This is only used when the postcopy migration is paused but wants
4027  * to resume from a middle point.
4028  */
4029 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4030 {
4031     int ret = -EINVAL;
4032     QEMUFile *file = s->rp_state.from_dst_file;
4033     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4034     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4035     uint64_t size, end_mark;
4036 
4037     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4038 
4039     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4040         error_report("%s: incorrect state %s", __func__,
4041                      MigrationStatus_str(s->state));
4042         return -EINVAL;
4043     }
4044 
4045     /*
4046      * Note: see comments in ramblock_recv_bitmap_send() on why we
4047      * need the endianness conversion and the padding.
4048      */
4049     local_size = ROUND_UP(local_size, 8);
4050 
4051     /* Add paddings */
4052     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4053 
4054     size = qemu_get_be64(file);
4055 
4056     /* The size of the bitmap should match our ramblock */
4057     if (size != local_size) {
4058         error_report("%s: ramblock '%s' bitmap size mismatch "
4059                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4060                      block->idstr, size, local_size);
4061         ret = -EINVAL;
4062         goto out;
4063     }
4064 
4065     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4066     end_mark = qemu_get_be64(file);
4067 
4068     ret = qemu_file_get_error(file);
4069     if (ret || size != local_size) {
4070         error_report("%s: read bitmap failed for ramblock '%s': %d"
4071                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4072                      __func__, block->idstr, ret, local_size, size);
4073         ret = -EIO;
4074         goto out;
4075     }
4076 
4077     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4078         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4079                      __func__, block->idstr, end_mark);
4080         ret = -EINVAL;
4081         goto out;
4082     }
4083 
4084     /*
4085      * Endianness conversion.  We are in postcopy (though paused).
4086      * The dirty bitmap won't change. We can directly modify it.
4087      */
4088     bitmap_from_le(block->bmap, le_bitmap, nbits);
4089 
4090     /*
4091      * What we received is the "received bitmap".  Invert it to form
4092      * the initial dirty bitmap for this ramblock.
4093      */
4094     bitmap_complement(block->bmap, block->bmap, nbits);
4095 
4096     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4097 
4098     /*
4099      * We succeeded in syncing the bitmap for the current ramblock.  If
4100      * this is the last one to sync, we need to notify the main send thread.
4101      */
4102     ram_dirty_bitmap_reload_notify(s);
4103 
4104     ret = 0;
4105 out:
4106     g_free(le_bitmap);
4107     return ret;
4108 }
4109 
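/*
 * Illustrative sketch only -- not part of ram.c.  The recv-bitmap record
 * consumed by ram_dirty_bitmap_reload() above is: a big-endian 64-bit
 * payload size, the little-endian bitmap payload padded to a multiple of
 * 8 bytes, and a big-endian 64-bit end mark.  The expected payload size
 * for a block follows from its page count (assuming 4 KiB target pages;
 * kept under #if 0 so it is not built):
 */
#if 0
#include <stdint.h>

static uint64_t example_recv_bitmap_size(uint64_t used_length)
{
    const unsigned target_page_bits = 12;              /* 4 KiB pages */
    uint64_t nbits = used_length >> target_page_bits;
    uint64_t bytes = (nbits + 7) / 8;                  /* DIV_ROUND_UP(nbits, 8) */
    return (bytes + 7) & ~UINT64_C(7);                 /* ROUND_UP(bytes, 8)     */
}
#endif
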
4110 static int ram_resume_prepare(MigrationState *s, void *opaque)
4111 {
4112     RAMState *rs = *(RAMState **)opaque;
4113     int ret;
4114 
4115     ret = ram_dirty_bitmap_sync_all(s, rs);
4116     if (ret) {
4117         return ret;
4118     }
4119 
4120     ram_state_resume_prepare(rs, s->to_dst_file);
4121 
4122     return 0;
4123 }
4124 
4125 static SaveVMHandlers savevm_ram_handlers = {
4126     .save_setup = ram_save_setup,
4127     .save_live_iterate = ram_save_iterate,
4128     .save_live_complete_postcopy = ram_save_complete,
4129     .save_live_complete_precopy = ram_save_complete,
4130     .has_postcopy = ram_has_postcopy,
4131     .save_live_pending = ram_save_pending,
4132     .load_state = ram_load,
4133     .save_cleanup = ram_save_cleanup,
4134     .load_setup = ram_load_setup,
4135     .load_cleanup = ram_load_cleanup,
4136     .resume_prepare = ram_resume_prepare,
4137 };
4138 
4139 void ram_mig_init(void)
4140 {
4141     qemu_mutex_init(&XBZRLE.lock);
4142     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4143 }
4144