xref: /openbmc/qemu/migration/ram.c (revision 8afc43ea)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 #include "options.h"
61 
62 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 
64 #if defined(__linux__)
65 #include "qemu/userfaultfd.h"
66 #endif /* defined(__linux__) */
67 
68 /***********************************************************/
69 /* ram save/restore */
70 
71 /*
72  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73  * worked for pages that were filled with the same char.  We switched
74  * it to only search for the zero value, and renamed it to avoid
75  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76  */
77 /*
78  * RAM_SAVE_FLAG_FULL was obsoleted in 2009, so it can be reused now
79  */
80 #define RAM_SAVE_FLAG_FULL     0x01
81 #define RAM_SAVE_FLAG_ZERO     0x02
82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
83 #define RAM_SAVE_FLAG_PAGE     0x08
84 #define RAM_SAVE_FLAG_EOS      0x10
85 #define RAM_SAVE_FLAG_CONTINUE 0x20
86 #define RAM_SAVE_FLAG_XBZRLE   0x40
87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
88 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
89 /* We can't use any flag that is bigger than 0x200 */
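/*
 * Note: these flags are carried in the low bits of the page offset written
 * by save_page_header(); since offsets are target-page aligned, the flags
 * must fit below the smallest target page size, which is presumably why
 * flags above 0x200 cannot be used.
 */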
90 
91 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
92      uint8_t *, int) = xbzrle_encode_buffer;
93 #if defined(CONFIG_AVX512BW_OPT)
94 #include "qemu/cpuid.h"
95 static void __attribute__((constructor)) init_cpu_flag(void)
96 {
97     unsigned max = __get_cpuid_max(0, NULL);
98     int a, b, c, d;
99     if (max >= 1) {
100         __cpuid(1, a, b, c, d);
101          /* We must check that AVX is not just available, but usable.  */
102         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
103             int bv;
104             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
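            /*
             * xgetbv with ECX=0 reads XCR0, i.e. which register states the
             * OS will save and restore on context switch.
             */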
105             __cpuid_count(7, 0, a, b, c, d);
106            /* 0xe6:
107             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
108             *                    and ZMM16-ZMM31 state are enabled by OS)
109             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
110             */
111             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
112                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
113             }
114         }
115     }
116 }
117 #endif
118 
119 XBZRLECacheStats xbzrle_counters;
120 
121 /* used by the search for pages to send */
122 struct PageSearchStatus {
123     /* The migration channel used for a specific host page */
124     QEMUFile    *pss_channel;
125     /* Last block from where we have sent data */
126     RAMBlock *last_sent_block;
127     /* Current block being searched */
128     RAMBlock    *block;
129     /* Current page to search from */
130     unsigned long page;
131     /* Set once we wrap around */
132     bool         complete_round;
133     /* Whether we're sending a host page */
134     bool          host_page_sending;
135     /* The start/end of current host page.  Invalid if host_page_sending==false */
136     unsigned long host_page_start;
137     unsigned long host_page_end;
138 };
139 typedef struct PageSearchStatus PageSearchStatus;
140 
141 /* This struct contains the XBZRLE cache and a static page
142    used by the compression */
143 static struct {
144     /* buffer used for XBZRLE encoding */
145     uint8_t *encoded_buf;
146     /* buffer for storing page content */
147     uint8_t *current_buf;
148     /* Cache for XBZRLE, Protected by lock. */
149     PageCache *cache;
150     QemuMutex lock;
151     /* it will store a page full of zeros */
152     uint8_t *zero_target_page;
153     /* buffer used for XBZRLE decoding */
154     uint8_t *decoded_buf;
155 } XBZRLE;
156 
157 static void XBZRLE_cache_lock(void)
158 {
159     if (migrate_xbzrle()) {
160         qemu_mutex_lock(&XBZRLE.lock);
161     }
162 }
163 
164 static void XBZRLE_cache_unlock(void)
165 {
166     if (migrate_xbzrle()) {
167         qemu_mutex_unlock(&XBZRLE.lock);
168     }
169 }
170 
171 /**
172  * xbzrle_cache_resize: resize the xbzrle cache
173  *
174  * This function is called from migrate_params_apply in the main
175  * thread, possibly while a migration is in progress.  A running
176  * migration may be using the cache and might finish during this call,
177  * hence changes to the cache are protected by the XBZRLE.lock mutex.
178  *
179  * Returns 0 for success or -1 for error
180  *
181  * @new_size: new cache size
182  * @errp: set *errp if the check failed, with reason
183  */
184 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
185 {
186     PageCache *new_cache;
187     int64_t ret = 0;
188 
189     /* Check for truncation */
190     if (new_size != (size_t)new_size) {
191         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
192                    "exceeding address space");
193         return -1;
194     }
195 
196     if (new_size == migrate_xbzrle_cache_size()) {
197         /* nothing to do */
198         return 0;
199     }
200 
201     XBZRLE_cache_lock();
202 
203     if (XBZRLE.cache != NULL) {
204         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
205         if (!new_cache) {
206             ret = -1;
207             goto out;
208         }
209 
210         cache_fini(XBZRLE.cache);
211         XBZRLE.cache = new_cache;
212     }
213 out:
214     XBZRLE_cache_unlock();
215     return ret;
216 }
217 
218 static bool postcopy_preempt_active(void)
219 {
220     return migrate_postcopy_preempt() && migration_in_postcopy();
221 }
222 
223 bool ramblock_is_ignored(RAMBlock *block)
224 {
225     return !qemu_ram_is_migratable(block) ||
226            (migrate_ignore_shared() && qemu_ram_is_shared(block));
227 }
228 
229 #undef RAMBLOCK_FOREACH
230 
231 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
232 {
233     RAMBlock *block;
234     int ret = 0;
235 
236     RCU_READ_LOCK_GUARD();
237 
238     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
239         ret = func(block, opaque);
240         if (ret) {
241             break;
242         }
243     }
244     return ret;
245 }
246 
247 static void ramblock_recv_map_init(void)
248 {
249     RAMBlock *rb;
250 
251     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
252         assert(!rb->receivedmap);
253         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
254     }
255 }
256 
257 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
258 {
259     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
260                     rb->receivedmap);
261 }
262 
263 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
264 {
265     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
266 }
267 
268 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
269 {
270     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
271 }
272 
273 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
274                                     size_t nr)
275 {
276     bitmap_set_atomic(rb->receivedmap,
277                       ramblock_recv_bitmap_offset(host_addr, rb),
278                       nr);
279 }
280 
281 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
282 
283 /*
284  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
285  *
286  * Returns the number of bytes sent (>0) on success, or <0 on error.
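 *
 * For example, a 1GiB block with 4KiB target pages has 262144 bits; the
 * stream then carries an 8-byte size (32768), 32768 bytes of little-endian
 * bitmap, and the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker.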
287  */
288 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
289                                   const char *block_name)
290 {
291     RAMBlock *block = qemu_ram_block_by_name(block_name);
292     unsigned long *le_bitmap, nbits;
293     uint64_t size;
294 
295     if (!block) {
296         error_report("%s: invalid block name: %s", __func__, block_name);
297         return -1;
298     }
299 
300     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
301 
302     /*
303      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
304      * machines we may need 4 more bytes for padding (see the comment
305      * below).  So extend it a bit beforehand.
306      */
307     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
308 
309     /*
310      * Always use little endian when sending the bitmap. This is
311      * required so that source and destination VMs that do not use the
312      * same endianness still interoperate. (Note: big endian won't work.)
313      */
314     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
315 
316     /* Size of the bitmap, in bytes */
317     size = DIV_ROUND_UP(nbits, 8);
318 
319     /*
320      * size is always aligned to 8 bytes for 64bit machines, but that
321      * may not be true for 32bit machines. We need this padding to
322      * make sure the migration can survive even between 32bit and
323      * 64bit machines.
324      */
325     size = ROUND_UP(size, 8);
326 
327     qemu_put_be64(file, size);
328     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
329     /*
330      * Mark as an end, in case the middle part is screwed up due to
331      * some "mysterious" reason.
332      */
333     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
334     qemu_fflush(file);
335 
336     g_free(le_bitmap);
337 
338     if (qemu_file_get_error(file)) {
339         return qemu_file_get_error(file);
340     }
341 
342     return size + sizeof(size);
343 }
344 
345 /*
346  * An outstanding page request, on the source, having been received
347  * and queued
348  */
349 struct RAMSrcPageRequest {
350     RAMBlock *rb;
351     hwaddr    offset;
352     hwaddr    len;
353 
354     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
355 };
356 
357 /* State of RAM for migration */
358 struct RAMState {
359     /*
360      * PageSearchStatus structures for the channels when sending pages.
361      * Protected by the bitmap_mutex.
362      */
363     PageSearchStatus pss[RAM_CHANNEL_MAX];
364     /* UFFD file descriptor, used in 'write-tracking' migration */
365     int uffdio_fd;
366     /* total ram size in bytes */
367     uint64_t ram_bytes_total;
368     /* Last block that we have visited searching for dirty pages */
369     RAMBlock *last_seen_block;
370     /* Last dirty target page we have sent */
371     ram_addr_t last_page;
372     /* last ram version we have seen */
373     uint32_t last_version;
374     /* How many times we have seen too many pages dirtied in a period */
375     int dirty_rate_high_cnt;
376     /* these variables are used for bitmap sync */
377     /* last time we did a full bitmap_sync */
378     int64_t time_last_bitmap_sync;
379     /* bytes transferred at start_time */
380     uint64_t bytes_xfer_prev;
381     /* number of dirty pages since start_time */
382     uint64_t num_dirty_pages_period;
383     /* xbzrle misses since the beginning of the period */
384     uint64_t xbzrle_cache_miss_prev;
385     /* Amount of xbzrle pages since the beginning of the period */
386     uint64_t xbzrle_pages_prev;
387     /* Amount of xbzrle encoded bytes since the beginning of the period */
388     uint64_t xbzrle_bytes_prev;
389     /* Start using XBZRLE (e.g., after the first round). */
390     bool xbzrle_enabled;
391     /* Are we on the last stage of migration */
392     bool last_stage;
393     /* compression statistics since the beginning of the period */
394     /* number of times there was no free thread to compress data */
395     uint64_t compress_thread_busy_prev;
396     /* number of bytes after compression */
397     uint64_t compressed_size_prev;
398     /* number of compressed pages */
399     uint64_t compress_pages_prev;
400 
401     /* total handled target pages at the beginning of period */
402     uint64_t target_page_count_prev;
403     /* total handled target pages since start */
404     uint64_t target_page_count;
405     /* number of dirty bits in the bitmap */
406     uint64_t migration_dirty_pages;
407     /*
408      * Protects:
409      * - dirty/clear bitmap
410      * - migration_dirty_pages
411      * - pss structures
412      */
413     QemuMutex bitmap_mutex;
414     /* The RAMBlock used in the last src_page_requests */
415     RAMBlock *last_req_rb;
416     /* Queue of outstanding page requests from the destination */
417     QemuMutex src_page_req_mutex;
418     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
419 };
420 typedef struct RAMState RAMState;
421 
422 static RAMState *ram_state;
423 
424 static NotifierWithReturnList precopy_notifier_list;
425 
426 /* Whether postcopy has queued page requests */
427 static bool postcopy_has_request(RAMState *rs)
428 {
429     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
430 }
431 
432 void precopy_infrastructure_init(void)
433 {
434     notifier_with_return_list_init(&precopy_notifier_list);
435 }
436 
437 void precopy_add_notifier(NotifierWithReturn *n)
438 {
439     notifier_with_return_list_add(&precopy_notifier_list, n);
440 }
441 
442 void precopy_remove_notifier(NotifierWithReturn *n)
443 {
444     notifier_with_return_remove(n);
445 }
446 
447 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
448 {
449     PrecopyNotifyData pnd;
450     pnd.reason = reason;
451     pnd.errp = errp;
452 
453     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
454 }
455 
456 uint64_t ram_bytes_remaining(void)
457 {
458     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
459                        0;
460 }
461 
462 RAMStats ram_counters;
463 
464 void ram_transferred_add(uint64_t bytes)
465 {
466     if (runstate_is_running()) {
467         stat64_add(&ram_counters.precopy_bytes, bytes);
468     } else if (migration_in_postcopy()) {
469         stat64_add(&ram_counters.postcopy_bytes, bytes);
470     } else {
471         stat64_add(&ram_counters.downtime_bytes, bytes);
472     }
473     stat64_add(&ram_counters.transferred, bytes);
474 }
475 
476 struct MigrationOps {
477     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478 };
479 typedef struct MigrationOps MigrationOps;
480 
481 MigrationOps *migration_ops;
482 
483 CompressionStats compression_counters;
484 
485 struct CompressParam {
486     bool done;
487     bool quit;
488     bool zero_page;
489     QEMUFile *file;
490     QemuMutex mutex;
491     QemuCond cond;
492     RAMBlock *block;
493     ram_addr_t offset;
494 
495     /* internally used fields */
496     z_stream stream;
497     uint8_t *originbuf;
498 };
499 typedef struct CompressParam CompressParam;
500 
501 struct DecompressParam {
502     bool done;
503     bool quit;
504     QemuMutex mutex;
505     QemuCond cond;
506     void *des;
507     uint8_t *compbuf;
508     int len;
509     z_stream stream;
510 };
511 typedef struct DecompressParam DecompressParam;
512 
513 static CompressParam *comp_param;
514 static QemuThread *compress_threads;
515 /* comp_done_cond is used to wake up the migration thread when
516  * one of the compression threads has finished its compression.
517  * comp_done_lock is used together with comp_done_cond.
518  */
519 static QemuMutex comp_done_lock;
520 static QemuCond comp_done_cond;
521 
522 static QEMUFile *decomp_file;
523 static DecompressParam *decomp_param;
524 static QemuThread *decompress_threads;
525 static QemuMutex decomp_done_lock;
526 static QemuCond decomp_done_cond;
527 
528 static int ram_save_host_page_urgent(PageSearchStatus *pss);
529 
530 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
531                                  ram_addr_t offset, uint8_t *source_buf);
532 
533 /* NOTE: page is the PFN, not a real ram_addr_t. */
534 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
535 {
536     pss->block = rb;
537     pss->page = page;
538     pss->complete_round = false;
539 }
540 
541 /*
542  * Check whether two PSSs are actively sending the same page.  Return true
543  * if it is, false otherwise.
544  */
545 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
546 {
547     return pss1->host_page_sending && pss2->host_page_sending &&
548         (pss1->host_page_start == pss2->host_page_start);
549 }
550 
551 static void *do_data_compress(void *opaque)
552 {
553     CompressParam *param = opaque;
554     RAMBlock *block;
555     ram_addr_t offset;
556     bool zero_page;
557 
558     qemu_mutex_lock(&param->mutex);
559     while (!param->quit) {
560         if (param->block) {
561             block = param->block;
562             offset = param->offset;
563             param->block = NULL;
564             qemu_mutex_unlock(&param->mutex);
565 
566             zero_page = do_compress_ram_page(param->file, &param->stream,
567                                              block, offset, param->originbuf);
568 
569             qemu_mutex_lock(&comp_done_lock);
570             param->done = true;
571             param->zero_page = zero_page;
572             qemu_cond_signal(&comp_done_cond);
573             qemu_mutex_unlock(&comp_done_lock);
574 
575             qemu_mutex_lock(&param->mutex);
576         } else {
577             qemu_cond_wait(&param->cond, &param->mutex);
578         }
579     }
580     qemu_mutex_unlock(&param->mutex);
581 
582     return NULL;
583 }
584 
585 static void compress_threads_save_cleanup(void)
586 {
587     int i, thread_count;
588 
589     if (!migrate_compress() || !comp_param) {
590         return;
591     }
592 
593     thread_count = migrate_compress_threads();
594     for (i = 0; i < thread_count; i++) {
595         /*
596          * we use it as an indicator of whether the thread has been
597          * properly initialized or not
598          */
599         if (!comp_param[i].file) {
600             break;
601         }
602 
603         qemu_mutex_lock(&comp_param[i].mutex);
604         comp_param[i].quit = true;
605         qemu_cond_signal(&comp_param[i].cond);
606         qemu_mutex_unlock(&comp_param[i].mutex);
607 
608         qemu_thread_join(compress_threads + i);
609         qemu_mutex_destroy(&comp_param[i].mutex);
610         qemu_cond_destroy(&comp_param[i].cond);
611         deflateEnd(&comp_param[i].stream);
612         g_free(comp_param[i].originbuf);
613         qemu_fclose(comp_param[i].file);
614         comp_param[i].file = NULL;
615     }
616     qemu_mutex_destroy(&comp_done_lock);
617     qemu_cond_destroy(&comp_done_cond);
618     g_free(compress_threads);
619     g_free(comp_param);
620     compress_threads = NULL;
621     comp_param = NULL;
622 }
623 
624 static int compress_threads_save_setup(void)
625 {
626     int i, thread_count;
627 
628     if (!migrate_compress()) {
629         return 0;
630     }
631     thread_count = migrate_compress_threads();
632     compress_threads = g_new0(QemuThread, thread_count);
633     comp_param = g_new0(CompressParam, thread_count);
634     qemu_cond_init(&comp_done_cond);
635     qemu_mutex_init(&comp_done_lock);
636     for (i = 0; i < thread_count; i++) {
637         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
638         if (!comp_param[i].originbuf) {
639             goto exit;
640         }
641 
642         if (deflateInit(&comp_param[i].stream,
643                         migrate_compress_level()) != Z_OK) {
644             g_free(comp_param[i].originbuf);
645             goto exit;
646         }
647 
648         /* comp_param[i].file is just used as a dummy buffer to save data,
649          * set its ops to empty.
650          */
651         comp_param[i].file = qemu_file_new_output(
652             QIO_CHANNEL(qio_channel_null_new()));
653         comp_param[i].done = true;
654         comp_param[i].quit = false;
655         qemu_mutex_init(&comp_param[i].mutex);
656         qemu_cond_init(&comp_param[i].cond);
657         qemu_thread_create(compress_threads + i, "compress",
658                            do_data_compress, comp_param + i,
659                            QEMU_THREAD_JOINABLE);
660     }
661     return 0;
662 
663 exit:
664     compress_threads_save_cleanup();
665     return -1;
666 }
667 
668 /**
669  * save_page_header: write page header to wire
670  *
671  * If the block differs from the last one sent, it also writes the block identification
672  *
673  * Returns the number of bytes written
674  *
675  * @pss: current PSS channel status
676  * @block: block that contains the page we want to send
677  * @offset: offset inside the block for the page
678  *          in the lower bits, it contains flags
679  */
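 *
 * On the wire this is one 8-byte big-endian word of (offset | flags),
 * optionally followed by a one-byte idstr length and the idstr itself
 * whenever the block differs from the last one sent (otherwise
 * RAM_SAVE_FLAG_CONTINUE is set and the idstr is omitted).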
680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
681                                RAMBlock *block, ram_addr_t offset)
682 {
683     size_t size, len;
684     bool same_block = (block == pss->last_sent_block);
685 
686     if (same_block) {
687         offset |= RAM_SAVE_FLAG_CONTINUE;
688     }
689     qemu_put_be64(f, offset);
690     size = 8;
691 
692     if (!same_block) {
693         len = strlen(block->idstr);
694         qemu_put_byte(f, len);
695         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
696         size += 1 + len;
697         pss->last_sent_block = block;
698     }
699     return size;
700 }
701 
702 /**
703  * mig_throttle_guest_down: throttle down the guest
704  *
705  * Reduce amount of guest cpu execution to hopefully slow down memory
706  * writes. If guest dirty memory rate is reduced below the rate at
707  * which we can transfer pages to the destination then we should be
708  * able to complete migration. Some workloads dirty memory way too
709  * fast and will not effectively converge, even with auto-converge.
710  */
711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
712                                     uint64_t bytes_dirty_threshold)
713 {
714     uint64_t pct_initial = migrate_cpu_throttle_initial();
715     uint64_t pct_increment = migrate_cpu_throttle_increment();
716     bool pct_tailslow = migrate_cpu_throttle_tailslow();
717     int pct_max = migrate_max_cpu_throttle();
718 
719     uint64_t throttle_now = cpu_throttle_get_percentage();
720     uint64_t cpu_now, cpu_ideal, throttle_inc;
721 
722     /* We have not started throttling yet. Let's start it. */
723     if (!cpu_throttle_active()) {
724         cpu_throttle_set(pct_initial);
725     } else {
726         /* Throttling already on, just increase the rate */
727         if (!pct_tailslow) {
728             throttle_inc = pct_increment;
729         } else {
730             /* Compute the ideal CPU percentage used by the guest, which
731              * may make the dirty rate match the dirty rate threshold. */
732             cpu_now = 100 - throttle_now;
733             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
734                         bytes_dirty_period);
735             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
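            /*
             * For instance, with the throttle at 20% (cpu_now = 80) and the
             * guest dirtying twice the threshold, cpu_ideal = 40, so the
             * throttle is raised by min(40, pct_increment).
             */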
736         }
737         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
738     }
739 }
740 
741 void mig_throttle_counter_reset(void)
742 {
743     RAMState *rs = ram_state;
744 
745     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
746     rs->num_dirty_pages_period = 0;
747     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
748 }
749 
750 /**
751  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
752  *
753  * @rs: current RAM state
754  * @current_addr: address for the zero page
755  *
756  * Update the xbzrle cache to reflect a page that's been sent as all 0.
757  * The important thing is that a stale (not-yet-0'd) page be replaced
758  * by the new data.
759  * As a bonus, if the page wasn't in the cache it gets added so that
760  * when a small write is made into the 0'd page it gets XBZRLE sent.
761  */
762 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
763 {
764     /* We don't care if this fails to allocate a new cache page
765      * as long as it updated an old one */
766     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
767                  stat64_get(&ram_counters.dirty_sync_count));
768 }
769 
770 #define ENCODING_FLAG_XBZRLE 0x1
771 
772 /**
773  * save_xbzrle_page: compress and send current page
774  *
775  * Returns: 1 means that we wrote the page
776  *          0 means that page is identical to the one already sent
777  *          -1 means that xbzrle would be longer than normal
778  *
779  * @rs: current RAM state
780  * @pss: current PSS channel
781  * @current_data: pointer to the address of the page contents
782  * @current_addr: addr of the page
783  * @block: block that contains the page we want to send
784  * @offset: offset inside the block for the page
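 *
 * On the wire an XBZRLE page is the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, followed by one ENCODING_FLAG_XBZRLE byte, a
 * 2-byte encoded length and the encoded data itself (hence the
 * "encoded_len + 1 + 2" accounting below).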
785  */
786 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
787                             uint8_t **current_data, ram_addr_t current_addr,
788                             RAMBlock *block, ram_addr_t offset)
789 {
790     int encoded_len = 0, bytes_xbzrle;
791     uint8_t *prev_cached_page;
792     QEMUFile *file = pss->pss_channel;
793     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
794 
795     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
796         xbzrle_counters.cache_miss++;
797         if (!rs->last_stage) {
798             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
799                              generation) == -1) {
800                 return -1;
801             } else {
802                 /* update *current_data when the page has been
803                    inserted into cache */
804                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
805             }
806         }
807         return -1;
808     }
809 
810     /*
811      * Reaching here means the page has hit the xbzrle cache, no matter what
812      * encoding result it is (normal encoding, overflow or skipping the page),
813      * count the page as encoded. This is used to calculate the encoding rate.
814      *
815      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
816      * 2nd page turns out to be skipped (i.e. no new bytes written to the
817      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
818      * skipped page included. In this way, the encoding rate can tell if the
819      * guest page is good for xbzrle encoding.
820      */
821     xbzrle_counters.pages++;
822     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
823 
824     /* save current buffer into memory */
825     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
826 
827     /* XBZRLE encoding (if there is no overflow) */
828     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
829                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
830                                             TARGET_PAGE_SIZE);
831 
832     /*
833      * Update the cache contents, so that it corresponds to the data
834      * sent, in all cases except where we skip the page.
835      */
836     if (!rs->last_stage && encoded_len != 0) {
837         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
838         /*
839          * In the case where we couldn't compress, ensure that the caller
840          * sends the data from the cache, since the guest might have
841          * changed the RAM since we copied it.
842          */
843         *current_data = prev_cached_page;
844     }
845 
846     if (encoded_len == 0) {
847         trace_save_xbzrle_page_skipping();
848         return 0;
849     } else if (encoded_len == -1) {
850         trace_save_xbzrle_page_overflow();
851         xbzrle_counters.overflow++;
852         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
853         return -1;
854     }
855 
856     /* Send XBZRLE based compressed page */
857     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
858                                     offset | RAM_SAVE_FLAG_XBZRLE);
859     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
860     qemu_put_be16(file, encoded_len);
861     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
862     bytes_xbzrle += encoded_len + 1 + 2;
863     /*
864      * Like compressed_size (please see update_compress_thread_counts),
865      * the xbzrle encoded bytes don't count the 8 byte header with
866      * RAM_SAVE_FLAG_CONTINUE.
867      */
868     xbzrle_counters.bytes += bytes_xbzrle - 8;
869     ram_transferred_add(bytes_xbzrle);
870 
871     return 1;
872 }
873 
874 /**
875  * pss_find_next_dirty: find the next dirty page of current ramblock
876  *
877  * This function updates pss->page to point to the next dirty page index
878  * within the ramblock to migrate, or the end of ramblock when nothing
879  * found.  Note that when pss->host_page_sending==true it means we're
880  * in the middle of sending a host page, so we won't look for dirty
881  * pages outside the host page boundary.
882  *
883  * @pss: the current page search status
884  */
885 static void pss_find_next_dirty(PageSearchStatus *pss)
886 {
887     RAMBlock *rb = pss->block;
888     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
889     unsigned long *bitmap = rb->bmap;
890 
891     if (ramblock_is_ignored(rb)) {
892         /* Point directly at the end, so we know there is no dirty page */
893         pss->page = size;
894         return;
895     }
896 
897     /*
898      * If we are in the middle of sending a host page, only look for dirty
899      * pages within the host page currently being sent.
900      */
901     if (pss->host_page_sending) {
902         assert(pss->host_page_end);
903         size = MIN(size, pss->host_page_end);
904     }
905 
906     pss->page = find_next_bit(bitmap, size, pss->page);
907 }
908 
909 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
910                                                        unsigned long page)
911 {
912     uint8_t shift;
913     hwaddr size, start;
914 
915     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
916         return;
917     }
918 
919     shift = rb->clear_bmap_shift;
920     /*
921      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... This
922      * can make things easier sometimes since then the start address
923      * of the small chunk will always be aligned to 64 pages, so the
924      * bitmap will always be aligned to unsigned long. We should
925      * even be able to remove this restriction but I'm simply
926      * keeping it.
927      */
928     assert(shift >= 6);
929 
930     size = 1ULL << (TARGET_PAGE_BITS + shift);
931     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
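    /*
     * For example, with 4KiB target pages and a clear_bmap_shift of 18,
     * each chunk covers 1GiB and start is rounded down to a 1GiB boundary.
     */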
932     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
933     memory_region_clear_dirty_bitmap(rb->mr, start, size);
934 }
935 
936 static void
937 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
938                                                  unsigned long start,
939                                                  unsigned long npages)
940 {
941     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
942     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
943     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
944 
945     /*
946      * Clear pages from start to start + npages - 1, so the end boundary is
947      * exclusive.
948      */
949     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
950         migration_clear_memory_region_dirty_bitmap(rb, i);
951     }
952 }
953 
954 /*
955  * colo_bitmap_find_dirty: find contiguous dirty pages from start
956  *
957  * Returns the page offset within the memory region of the start of the
958  * contiguous dirty pages
959  *
960  * @rs: current RAM state
961  * @rb: RAMBlock where to search for dirty pages
962  * @start: page where we start the search
963  * @num: the number of contiguous dirty pages
964  */
965 static inline
966 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
967                                      unsigned long start, unsigned long *num)
968 {
969     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
970     unsigned long *bitmap = rb->bmap;
971     unsigned long first, next;
972 
973     *num = 0;
974 
975     if (ramblock_is_ignored(rb)) {
976         return size;
977     }
978 
979     first = find_next_bit(bitmap, size, start);
980     if (first >= size) {
981         return first;
982     }
983     next = find_next_zero_bit(bitmap, size, first + 1);
984     assert(next >= first);
985     *num = next - first;
986     return first;
987 }
988 
989 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
990                                                 RAMBlock *rb,
991                                                 unsigned long page)
992 {
993     bool ret;
994 
995     /*
996      * Clear the dirty bitmap if needed.  This _must_ be called before we
997      * send any of the pages in the chunk, because we need to make sure
998      * we can capture further page content changes when we sync the dirty
999      * log the next time.  So as long as we are going to send any of
1000      * the pages in the chunk we clear the remote dirty bitmap for all.
1001      * Clearing it earlier won't be a problem, but clearing it too late will.
1002      */
1003     migration_clear_memory_region_dirty_bitmap(rb, page);
1004 
1005     ret = test_and_clear_bit(page, rb->bmap);
1006     if (ret) {
1007         rs->migration_dirty_pages--;
1008     }
1009 
1010     return ret;
1011 }
1012 
1013 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1014                                        void *opaque)
1015 {
1016     const hwaddr offset = section->offset_within_region;
1017     const hwaddr size = int128_get64(section->size);
1018     const unsigned long start = offset >> TARGET_PAGE_BITS;
1019     const unsigned long npages = size >> TARGET_PAGE_BITS;
1020     RAMBlock *rb = section->mr->ram_block;
1021     uint64_t *cleared_bits = opaque;
1022 
1023     /*
1024      * We don't grab ram_state->bitmap_mutex because we expect to run
1025      * only when starting migration or during postcopy recovery where
1026      * we don't have concurrent access.
1027      */
1028     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1029         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1030     }
1031     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1032     bitmap_clear(rb->bmap, start, npages);
1033 }
1034 
1035 /*
1036  * Exclude all dirty pages from migration that fall into a discarded range as
1037  * managed by a RamDiscardManager responsible for the mapped memory region of
1038  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1039  *
1040  * Discarded pages ("logically unplugged") have undefined content and must
1041  * not get migrated, because even reading these pages for migration might
1042  * result in undesired behavior.
1043  *
1044  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1045  *
1046  * Note: The result is only stable while migrating (precopy/postcopy).
1047  */
1048 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1049 {
1050     uint64_t cleared_bits = 0;
1051 
1052     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1053         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1054         MemoryRegionSection section = {
1055             .mr = rb->mr,
1056             .offset_within_region = 0,
1057             .size = int128_make64(qemu_ram_get_used_length(rb)),
1058         };
1059 
1060         ram_discard_manager_replay_discarded(rdm, &section,
1061                                              dirty_bitmap_clear_section,
1062                                              &cleared_bits);
1063     }
1064     return cleared_bits;
1065 }
1066 
1067 /*
1068  * Check if a host-page aligned page falls into a discarded range as managed by
1069  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1070  *
1071  * Note: The result is only stable while migrating (precopy/postcopy).
1072  */
1073 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1074 {
1075     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1076         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1077         MemoryRegionSection section = {
1078             .mr = rb->mr,
1079             .offset_within_region = start,
1080             .size = int128_make64(qemu_ram_pagesize(rb)),
1081         };
1082 
1083         return !ram_discard_manager_is_populated(rdm, &section);
1084     }
1085     return false;
1086 }
1087 
1088 /* Called with RCU critical section */
1089 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1090 {
1091     uint64_t new_dirty_pages =
1092         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1093 
1094     rs->migration_dirty_pages += new_dirty_pages;
1095     rs->num_dirty_pages_period += new_dirty_pages;
1096 }
1097 
1098 /**
1099  * ram_pagesize_summary: calculate all the pagesizes of a VM
1100  *
1101  * Returns a summary bitmap of the page sizes of all RAMBlocks
1102  *
1103  * For VMs with just normal pages this is equivalent to the host page
1104  * size. If it's got some huge pages then it's the OR of all the
1105  * different page sizes.
1106  */
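 *
 * For example, a guest with normal 4KiB pages plus a 2MiB hugepage-backed
 * region would report 0x1000 | 0x200000 = 0x201000.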
1107 uint64_t ram_pagesize_summary(void)
1108 {
1109     RAMBlock *block;
1110     uint64_t summary = 0;
1111 
1112     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1113         summary |= block->page_size;
1114     }
1115 
1116     return summary;
1117 }
1118 
1119 uint64_t ram_get_total_transferred_pages(void)
1120 {
1121     return stat64_get(&ram_counters.normal_pages) +
1122         stat64_get(&ram_counters.zero_pages) +
1123         compression_counters.pages + xbzrle_counters.pages;
1124 }
1125 
1126 static void migration_update_rates(RAMState *rs, int64_t end_time)
1127 {
1128     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1129     double compressed_size;
1130 
1131     /* calculate period counters */
1132     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1133                 / (end_time - rs->time_last_bitmap_sync);
1134 
1135     if (!page_count) {
1136         return;
1137     }
1138 
1139     if (migrate_xbzrle()) {
1140         double encoded_size, unencoded_size;
1141 
1142         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1143             rs->xbzrle_cache_miss_prev) / page_count;
1144         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1145         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1146                          TARGET_PAGE_SIZE;
1147         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1148         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1149             xbzrle_counters.encoding_rate = 0;
1150         } else {
1151             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1152         }
1153         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1154         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1155     }
1156 
1157     if (migrate_compress()) {
1158         compression_counters.busy_rate = (double)(compression_counters.busy -
1159             rs->compress_thread_busy_prev) / page_count;
1160         rs->compress_thread_busy_prev = compression_counters.busy;
1161 
1162         compressed_size = compression_counters.compressed_size -
1163                           rs->compressed_size_prev;
1164         if (compressed_size) {
1165             double uncompressed_size = (compression_counters.pages -
1166                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1167 
1168             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1169             compression_counters.compression_rate =
1170                                         uncompressed_size / compressed_size;
1171 
1172             rs->compress_pages_prev = compression_counters.pages;
1173             rs->compressed_size_prev = compression_counters.compressed_size;
1174         }
1175     }
1176 }
1177 
1178 static void migration_trigger_throttle(RAMState *rs)
1179 {
1180     uint64_t threshold = migrate_throttle_trigger_threshold();
1181     uint64_t bytes_xfer_period =
1182         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1183     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1184     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
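    /*
     * E.g. with a threshold of 50, bytes_dirty_threshold is half of what
     * was transferred this period; exceeding it twice triggers (or
     * increases) the CPU throttle below.
     */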
1185 
1186     /* During block migration the auto-converge logic incorrectly detects
1187      * that ram migration makes no progress. Avoid this by disabling the
1188      * throttling logic during the bulk phase of block migration. */
1189     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1190         /* The following detection logic can be refined later. For now:
1191            Check to see if the ratio between dirtied bytes and the approx.
1192            amount of bytes that just got transferred since the last time
1193            we were in this routine reaches the threshold. If that happens
1194            twice, start or increase throttling. */
1195 
1196         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1197             (++rs->dirty_rate_high_cnt >= 2)) {
1198             trace_migration_throttle();
1199             rs->dirty_rate_high_cnt = 0;
1200             mig_throttle_guest_down(bytes_dirty_period,
1201                                     bytes_dirty_threshold);
1202         }
1203     }
1204 }
1205 
1206 static void migration_bitmap_sync(RAMState *rs)
1207 {
1208     RAMBlock *block;
1209     int64_t end_time;
1210 
1211     stat64_add(&ram_counters.dirty_sync_count, 1);
1212 
1213     if (!rs->time_last_bitmap_sync) {
1214         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1215     }
1216 
1217     trace_migration_bitmap_sync_start();
1218     memory_global_dirty_log_sync();
1219 
1220     qemu_mutex_lock(&rs->bitmap_mutex);
1221     WITH_RCU_READ_LOCK_GUARD() {
1222         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1223             ramblock_sync_dirty_bitmap(rs, block);
1224         }
1225         ram_counters.remaining = ram_bytes_remaining();
1226     }
1227     qemu_mutex_unlock(&rs->bitmap_mutex);
1228 
1229     memory_global_after_dirty_log_sync();
1230     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1231 
1232     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1233 
1234     /* more than 1 second = 1000 milliseconds */
1235     if (end_time > rs->time_last_bitmap_sync + 1000) {
1236         migration_trigger_throttle(rs);
1237 
1238         migration_update_rates(rs, end_time);
1239 
1240         rs->target_page_count_prev = rs->target_page_count;
1241 
1242         /* reset period counters */
1243         rs->time_last_bitmap_sync = end_time;
1244         rs->num_dirty_pages_period = 0;
1245         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1246     }
1247     if (migrate_events()) {
1248         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1249         qapi_event_send_migration_pass(generation);
1250     }
1251 }
1252 
1253 static void migration_bitmap_sync_precopy(RAMState *rs)
1254 {
1255     Error *local_err = NULL;
1256 
1257     /*
1258      * The current notifier usage is just an optimization to migration, so we
1259      * don't stop the normal migration process in the error case.
1260      */
1261     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1262         error_report_err(local_err);
1263         local_err = NULL;
1264     }
1265 
1266     migration_bitmap_sync(rs);
1267 
1268     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1269         error_report_err(local_err);
1270     }
1271 }
1272 
1273 void ram_release_page(const char *rbname, uint64_t offset)
1274 {
1275     if (!migrate_release_ram() || !migration_in_postcopy()) {
1276         return;
1277     }
1278 
1279     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1280 }
1281 
1282 /**
1283  * save_zero_page_to_file: send the zero page to the file
1284  *
1285  * Returns the size of the data written to the file; 0 means the page is
1286  * not a zero page
1287  *
1288  * @pss: current PSS channel
1289  * @block: block that contains the page we want to send
1290  * @offset: offset inside the block for the page
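 *
 * On the wire a zero page is just the page header with RAM_SAVE_FLAG_ZERO
 * set, followed by a single zero byte.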
1291  */
1292 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1293                                   RAMBlock *block, ram_addr_t offset)
1294 {
1295     uint8_t *p = block->host + offset;
1296     int len = 0;
1297 
1298     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1299         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1300         qemu_put_byte(file, 0);
1301         len += 1;
1302         ram_release_page(block->idstr, offset);
1303     }
1304     return len;
1305 }
1306 
1307 /**
1308  * save_zero_page: send the zero page to the stream
1309  *
1310  * Returns the number of pages written (1), or -1 if the page was not zero.
1311  *
1312  * @pss: current PSS channel
1313  * @block: block that contains the page we want to send
1314  * @offset: offset inside the block for the page
1315  */
1316 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1317                           ram_addr_t offset)
1318 {
1319     int len = save_zero_page_to_file(pss, f, block, offset);
1320 
1321     if (len) {
1322         stat64_add(&ram_counters.zero_pages, 1);
1323         ram_transferred_add(len);
1324         return 1;
1325     }
1326     return -1;
1327 }
1328 
1329 /*
1330  * @pages: the number of pages written by the control path,
1331  *        < 0 - error
1332  *        > 0 - number of pages written
1333  *
1334  * Return true if the page has been saved, otherwise false is returned.
1335  */
1336 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1337                               ram_addr_t offset, int *pages)
1338 {
1339     uint64_t bytes_xmit = 0;
1340     int ret;
1341 
1342     *pages = -1;
1343     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1344                                 TARGET_PAGE_SIZE, &bytes_xmit);
1345     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1346         return false;
1347     }
1348 
1349     if (bytes_xmit) {
1350         ram_transferred_add(bytes_xmit);
1351         *pages = 1;
1352     }
1353 
1354     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1355         return true;
1356     }
1357 
1358     if (bytes_xmit > 0) {
1359         stat64_add(&ram_counters.normal_pages, 1);
1360     } else if (bytes_xmit == 0) {
1361         stat64_add(&ram_counters.zero_pages, 1);
1362     }
1363 
1364     return true;
1365 }
1366 
1367 /*
1368  * directly send the page to the stream
1369  *
1370  * Returns the number of pages written.
1371  *
1372  * @pss: current PSS channel
1373  * @block: block that contains the page we want to send
1374  * @offset: offset inside the block for the page
1375  * @buf: the page to be sent
1376  * @async: send the page asynchronously
1377  */
1378 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1379                             ram_addr_t offset, uint8_t *buf, bool async)
1380 {
1381     QEMUFile *file = pss->pss_channel;
1382 
1383     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1384                                          offset | RAM_SAVE_FLAG_PAGE));
1385     if (async) {
1386         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1387                               migrate_release_ram() &&
1388                               migration_in_postcopy());
1389     } else {
1390         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1391     }
1392     ram_transferred_add(TARGET_PAGE_SIZE);
1393     stat64_add(&ram_counters.normal_pages, 1);
1394     return 1;
1395 }
1396 
1397 /**
1398  * ram_save_page: send the given page to the stream
1399  *
1400  * Returns the number of pages written.
1401  *          < 0 - error
1402  *          >=0 - Number of pages written - this might legally be 0
1403  *                if xbzrle noticed the page was the same.
1404  *
1405  * @rs: current RAM state
1406  * @block: block that contains the page we want to send
1407  * @offset: offset inside the block for the page
1408  */
1409 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1410 {
1411     int pages = -1;
1412     uint8_t *p;
1413     bool send_async = true;
1414     RAMBlock *block = pss->block;
1415     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1416     ram_addr_t current_addr = block->offset + offset;
1417 
1418     p = block->host + offset;
1419     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1420 
1421     XBZRLE_cache_lock();
1422     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1423         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1424                                  block, offset);
1425         if (!rs->last_stage) {
1426             /* Can't send this cached data async, since the cache page
1427              * might get updated before it gets to the wire
1428              */
1429             send_async = false;
1430         }
1431     }
1432 
1433     /* XBZRLE overflow or normal page */
1434     if (pages == -1) {
1435         pages = save_normal_page(pss, block, offset, p, send_async);
1436     }
1437 
1438     XBZRLE_cache_unlock();
1439 
1440     return pages;
1441 }
1442 
1443 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1444                                  ram_addr_t offset)
1445 {
1446     if (multifd_queue_page(file, block, offset) < 0) {
1447         return -1;
1448     }
1449     stat64_add(&ram_counters.normal_pages, 1);
1450 
1451     return 1;
1452 }
1453 
1454 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1455                                  ram_addr_t offset, uint8_t *source_buf)
1456 {
1457     RAMState *rs = ram_state;
1458     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1459     uint8_t *p = block->host + offset;
1460     int ret;
1461 
1462     if (save_zero_page_to_file(pss, f, block, offset)) {
1463         return true;
1464     }
1465 
1466     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1467 
1468     /*
1469      * copy it to an internal buffer to avoid it being modified by the VM
1470      * so that we can catch any error during compression and
1471      * decompression
1472      */
1473     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1474     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1475     if (ret < 0) {
1476         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1477         error_report("compressed data failed!");
1478     }
1479     return false;
1480 }
1481 
1482 static void
1483 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1484 {
1485     ram_transferred_add(bytes_xmit);
1486 
1487     if (param->zero_page) {
1488         stat64_add(&ram_counters.zero_pages, 1);
1489         return;
1490     }
1491 
1492     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1493     compression_counters.compressed_size += bytes_xmit - 8;
1494     compression_counters.pages++;
1495 }
1496 
1497 static bool save_page_use_compression(RAMState *rs);
1498 
1499 static void flush_compressed_data(RAMState *rs)
1500 {
1501     MigrationState *ms = migrate_get_current();
1502     int idx, len, thread_count;
1503 
1504     if (!save_page_use_compression(rs)) {
1505         return;
1506     }
1507     thread_count = migrate_compress_threads();
1508 
1509     qemu_mutex_lock(&comp_done_lock);
1510     for (idx = 0; idx < thread_count; idx++) {
1511         while (!comp_param[idx].done) {
1512             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1513         }
1514     }
1515     qemu_mutex_unlock(&comp_done_lock);
1516 
1517     for (idx = 0; idx < thread_count; idx++) {
1518         qemu_mutex_lock(&comp_param[idx].mutex);
1519         if (!comp_param[idx].quit) {
1520             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1521             /*
1522              * it's safe to fetch zero_page without holding comp_done_lock
1523              * as there is no further request submitted to the thread,
1524              * i.e., the thread should be waiting for a request at this point.
1525              */
1526             update_compress_thread_counts(&comp_param[idx], len);
1527         }
1528         qemu_mutex_unlock(&comp_param[idx].mutex);
1529     }
1530 }
1531 
1532 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1533                                        ram_addr_t offset)
1534 {
1535     param->block = block;
1536     param->offset = offset;
1537 }
1538 
1539 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1540 {
1541     int idx, thread_count, bytes_xmit = -1, pages = -1;
1542     bool wait = migrate_compress_wait_thread();
1543     MigrationState *ms = migrate_get_current();
1544 
1545     thread_count = migrate_compress_threads();
1546     qemu_mutex_lock(&comp_done_lock);
1547 retry:
1548     for (idx = 0; idx < thread_count; idx++) {
1549         if (comp_param[idx].done) {
1550             comp_param[idx].done = false;
1551             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1552                                             comp_param[idx].file);
1553             qemu_mutex_lock(&comp_param[idx].mutex);
1554             set_compress_params(&comp_param[idx], block, offset);
1555             qemu_cond_signal(&comp_param[idx].cond);
1556             qemu_mutex_unlock(&comp_param[idx].mutex);
1557             pages = 1;
1558             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1559             break;
1560         }
1561     }
1562 
1563     /*
1564      * wait for a free thread if the user specifies 'compress-wait-thread',
1565      * otherwise we will post the page out in the main thread as a normal page.
1566      */
1567     if (pages < 0 && wait) {
1568         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1569         goto retry;
1570     }
1571     qemu_mutex_unlock(&comp_done_lock);
1572 
1573     return pages;
1574 }
1575 
1576 #define PAGE_ALL_CLEAN 0
1577 #define PAGE_TRY_AGAIN 1
1578 #define PAGE_DIRTY_FOUND 2
1579 /**
1580  * find_dirty_block: find the next dirty page and update any state
1581  * associated with the search process.
1582  *
1583  * Returns:
1584  *         PAGE_ALL_CLEAN: no dirty page found, give up
1585  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1586  *         PAGE_DIRTY_FOUND: dirty page found
1587  *
1588  * @rs: current RAM state
1589  * @pss: data about the state of the current dirty page scan
1591  */
1592 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1593 {
1594     /* Update pss->page for the next dirty bit in ramblock */
1595     pss_find_next_dirty(pss);
1596 
1597     if (pss->complete_round && pss->block == rs->last_seen_block &&
1598         pss->page >= rs->last_page) {
1599         /*
1600          * We've been once around the RAM and haven't found anything.
1601          * Give up.
1602          */
1603         return PAGE_ALL_CLEAN;
1604     }
1605     if (!offset_in_ramblock(pss->block,
1606                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1607         /* Didn't find anything in this RAM Block */
1608         pss->page = 0;
1609         pss->block = QLIST_NEXT_RCU(pss->block, next);
1610         if (!pss->block) {
1611             /*
1612              * If memory migration starts over, we may meet a dirtied page
1613              * that still exists in a compression thread's ring, so we
1614              * should flush the compressed data to make sure the new page
1615              * is not overwritten by the old one on the destination.
1616              *
1617              * Also, if xbzrle is on, stop using data compression at this
1618              * point.  In theory, xbzrle can do better than compression.
1619              */
1620             flush_compressed_data(rs);
1621 
1622             /* Hit the end of the list */
1623             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1624             /* Flag that we've looped */
1625             pss->complete_round = true;
1626             /* After the first round, enable XBZRLE. */
1627             if (migrate_xbzrle()) {
1628                 rs->xbzrle_enabled = true;
1629             }
1630         }
1631         /* Didn't find anything this time, but try again on the new block */
1632         return PAGE_TRY_AGAIN;
1633     } else {
1634         /* We've found something */
1635         return PAGE_DIRTY_FOUND;
1636     }
1637 }
1638 
1639 /**
1640  * unqueue_page: gets a page of the queue
1641  *
1642  * Helper for 'get_queued_page' - gets a page off the queue
1643  *
1644  * Returns the block of the page (or NULL if none available)
1645  *
1646  * @rs: current RAM state
1647  * @offset: used to return the offset within the RAMBlock
1648  */
1649 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1650 {
1651     struct RAMSrcPageRequest *entry;
1652     RAMBlock *block = NULL;
1653 
1654     if (!postcopy_has_request(rs)) {
1655         return NULL;
1656     }
1657 
1658     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1659 
1660     /*
1661      * This should _never_ change even after we take the lock, because no one
1662      * should be taking anything off the request list other than us.
1663      */
1664     assert(postcopy_has_request(rs));
1665 
1666     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1667     block = entry->rb;
1668     *offset = entry->offset;
1669 
1670     if (entry->len > TARGET_PAGE_SIZE) {
1671         entry->len -= TARGET_PAGE_SIZE;
1672         entry->offset += TARGET_PAGE_SIZE;
1673     } else {
1674         memory_region_unref(block->mr);
1675         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1676         g_free(entry);
1677         migration_consume_urgent_request();
1678     }
1679 
1680     return block;
1681 }
1682 
1683 #if defined(__linux__)
1684 /**
1685  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1686  *   is found, return RAM block pointer and page offset
1687  *
1688  * Returns pointer to the RAMBlock containing faulting page,
1689  *   NULL if no write faults are pending
1690  *
1691  * @rs: current RAM state
1692  * @offset: page offset from the beginning of the block
1693  */
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1695 {
1696     struct uffd_msg uffd_msg;
1697     void *page_address;
1698     RAMBlock *block;
1699     int res;
1700 
1701     if (!migrate_background_snapshot()) {
1702         return NULL;
1703     }
1704 
1705     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1706     if (res <= 0) {
1707         return NULL;
1708     }
1709 
1710     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1711     block = qemu_ram_block_from_host(page_address, false, offset);
1712     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1713     return block;
1714 }
1715 
1716 /**
1717  * ram_save_release_protection: release UFFD write protection after
1718  *   a range of pages has been saved
1719  *
1720  * @rs: current RAM state
1721  * @pss: page-search-status structure
1722  * @start_page: index of the first page in the range relative to pss->block
1723  *
1724  * Returns 0 on success, negative value in case of an error
1725  */
1726 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1727         unsigned long start_page)
1728 {
1729     int res = 0;
1730 
1731     /* Check if page is from UFFD-managed region. */
1732     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1733         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1734         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1735 
1736         /* Flush async buffers before un-protect. */
1737         qemu_fflush(pss->pss_channel);
1738         /* Un-protect memory range. */
1739         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1740                 false, false);
1741     }
1742 
1743     return res;
1744 }
1745 
1746 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1747  *
1748  * Returns true if supported, false otherwise
1749  */
1750 bool ram_write_tracking_available(void)
1751 {
1752     uint64_t uffd_features;
1753     int res;
1754 
1755     res = uffd_query_features(&uffd_features);
1756     return (res == 0 &&
1757             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1758 }
1759 
1760 /* ram_write_tracking_compatible: check if guest configuration is
1761  *   compatible with 'write-tracking'
1762  *
1763  * Returns true if compatible, false otherwise
1764  */
1765 bool ram_write_tracking_compatible(void)
1766 {
1767     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1768     int uffd_fd;
1769     RAMBlock *block;
1770     bool ret = false;
1771 
1772     /* Open UFFD file descriptor */
1773     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1774     if (uffd_fd < 0) {
1775         return false;
1776     }
1777 
1778     RCU_READ_LOCK_GUARD();
1779 
1780     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1781         uint64_t uffd_ioctls;
1782 
1783         /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1784         if (block->mr->readonly || block->mr->rom_device) {
1785             continue;
1786         }
1787         /* Try to register block memory via UFFD-IO to track writes */
1788         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1789                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1790             goto out;
1791         }
1792         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1793             goto out;
1794         }
1795     }
1796     ret = true;
1797 
1798 out:
1799     uffd_close_fd(uffd_fd);
1800     return ret;
1801 }
1802 
1803 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1804                                        ram_addr_t size)
1805 {
1806     const ram_addr_t end = offset + size;
1807 
1808     /*
1809      * We read one byte of each page; this will preallocate page tables if
1810      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1811      * where no page was populated yet. This might require adaptation when
1812      * supporting other mappings, like shmem.
1813      */
1814     for (; offset < end; offset += block->page_size) {
1815         char tmp = *((char *)block->host + offset);
1816 
1817         /* Don't optimize the read out */
1818         asm volatile("" : "+r" (tmp));
1819     }
1820 }
1821 
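/*
 * populate_read_section: RamDiscardManager replay callback that populates
 * one populated section of a RAMBlock by reading a byte of each page.
 */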
1822 static inline int populate_read_section(MemoryRegionSection *section,
1823                                         void *opaque)
1824 {
1825     const hwaddr size = int128_get64(section->size);
1826     hwaddr offset = section->offset_within_region;
1827     RAMBlock *block = section->mr->ram_block;
1828 
1829     populate_read_range(block, offset, size);
1830     return 0;
1831 }
1832 
1833 /*
1834  * ram_block_populate_read: preallocate page tables and populate pages in the
1835  *   RAM block by reading a byte of each page.
1836  *
1837  * Since it's solely used for the userfault_fd WP feature, here we just
1838  *   hardcode the page size to qemu_real_host_page_size.
1839  *
1840  * @rb: RAM block to populate
1841  */
1842 static void ram_block_populate_read(RAMBlock *rb)
1843 {
1844     /*
1845      * Skip populating all pages that fall into a discarded range as managed by
1846      * a RamDiscardManager responsible for the mapped memory region of the
1847      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1848      * must not get populated automatically. We don't have to track
1849      * modifications via userfaultfd WP reliably, because these pages will
1850      * not be part of the migration stream either way -- see
1851      * ramblock_dirty_bitmap_exclude_discarded_pages().
1852      *
1853      * Note: The result is only stable while migrating (precopy/postcopy).
1854      */
1855     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1856         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1857         MemoryRegionSection section = {
1858             .mr = rb->mr,
1859             .offset_within_region = 0,
1860             .size = rb->mr->size,
1861         };
1862 
1863         ram_discard_manager_replay_populated(rdm, &section,
1864                                              populate_read_section, NULL);
1865     } else {
1866         populate_read_range(rb, 0, rb->used_length);
1867     }
1868 }
1869 
1870 /*
1871  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1872  */
1873 void ram_write_tracking_prepare(void)
1874 {
1875     RAMBlock *block;
1876 
1877     RCU_READ_LOCK_GUARD();
1878 
1879     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880         /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1881         if (block->mr->readonly || block->mr->rom_device) {
1882             continue;
1883         }
1884 
1885         /*
1886          * Populate pages of the RAM block before enabling userfault_fd
1887          * write protection.
1888          *
1889          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1890          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1891          * pages with pte_none() entries in page table.
1892          */
1893         ram_block_populate_read(block);
1894     }
1895 }
1896 
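/*
 * uffd_protect_section: RamDiscardManager replay callback that applies UFFD
 * write protection to one populated section of a RAMBlock.  @opaque carries
 * the userfaultfd file descriptor.
 */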
1897 static inline int uffd_protect_section(MemoryRegionSection *section,
1898                                        void *opaque)
1899 {
1900     const hwaddr size = int128_get64(section->size);
1901     const hwaddr offset = section->offset_within_region;
1902     RAMBlock *rb = section->mr->ram_block;
1903     int uffd_fd = (uintptr_t)opaque;
1904 
1905     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1906                                   false);
1907 }
1908 
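/*
 * ram_block_uffd_protect: apply UFFD write protection to a whole RAMBlock,
 * restricting it to the populated parts when the block is managed by a
 * RamDiscardManager.
 *
 * Returns 0 on success, a non-zero value on failure.
 */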
1909 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1910 {
1911     assert(rb->flags & RAM_UF_WRITEPROTECT);
1912 
1913     /* See ram_block_populate_read() */
1914     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1915         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1916         MemoryRegionSection section = {
1917             .mr = rb->mr,
1918             .offset_within_region = 0,
1919             .size = rb->mr->size,
1920         };
1921 
1922         return ram_discard_manager_replay_populated(rdm, &section,
1923                                                     uffd_protect_section,
1924                                                     (void *)(uintptr_t)uffd_fd);
1925     }
1926     return uffd_change_protection(uffd_fd, rb->host,
1927                                   rb->used_length, true, false);
1928 }
1929 
1930 /*
1931  * ram_write_tracking_start: start UFFD-WP memory tracking
1932  *
1933  * Returns 0 for success or negative value in case of error
1934  */
1935 int ram_write_tracking_start(void)
1936 {
1937     int uffd_fd;
1938     RAMState *rs = ram_state;
1939     RAMBlock *block;
1940 
1941     /* Open UFFD file descriptor */
1942     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1943     if (uffd_fd < 0) {
1944         return uffd_fd;
1945     }
1946     rs->uffdio_fd = uffd_fd;
1947 
1948     RCU_READ_LOCK_GUARD();
1949 
1950     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1951         /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1952         if (block->mr->readonly || block->mr->rom_device) {
1953             continue;
1954         }
1955 
1956         /* Register block memory with UFFD to track writes */
1957         if (uffd_register_memory(rs->uffdio_fd, block->host,
1958                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1959             goto fail;
1960         }
1961         block->flags |= RAM_UF_WRITEPROTECT;
1962         memory_region_ref(block->mr);
1963 
1964         /* Apply UFFD write protection to the block memory range */
1965         if (ram_block_uffd_protect(block, uffd_fd)) {
1966             goto fail;
1967         }
1968 
1969         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1970                 block->host, block->max_length);
1971     }
1972 
1973     return 0;
1974 
1975 fail:
1976     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1977 
1978     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1979         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1980             continue;
1981         }
1982         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1983         /* Cleanup flags and remove reference */
1984         block->flags &= ~RAM_UF_WRITEPROTECT;
1985         memory_region_unref(block->mr);
1986     }
1987 
1988     uffd_close_fd(uffd_fd);
1989     rs->uffdio_fd = -1;
1990     return -1;
1991 }
1992 
1993 /**
1994  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1995  */
1996 void ram_write_tracking_stop(void)
1997 {
1998     RAMState *rs = ram_state;
1999     RAMBlock *block;
2000 
2001     RCU_READ_LOCK_GUARD();
2002 
2003     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2004         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2005             continue;
2006         }
2007         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2008 
2009         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2010                 block->host, block->max_length);
2011 
2012         /* Cleanup flags and remove reference */
2013         block->flags &= ~RAM_UF_WRITEPROTECT;
2014         memory_region_unref(block->mr);
2015     }
2016 
2017     /* Finally close UFFD file descriptor */
2018     uffd_close_fd(rs->uffdio_fd);
2019     rs->uffdio_fd = -1;
2020 }
2021 
2022 #else
2023 /* No target OS support, stubs just fail or ignore */
2024 
2025 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2026 {
2027     (void) rs;
2028     (void) offset;
2029 
2030     return NULL;
2031 }
2032 
2033 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2034         unsigned long start_page)
2035 {
2036     (void) rs;
2037     (void) pss;
2038     (void) start_page;
2039 
2040     return 0;
2041 }
2042 
2043 bool ram_write_tracking_available(void)
2044 {
2045     return false;
2046 }
2047 
2048 bool ram_write_tracking_compatible(void)
2049 {
2050     assert(0);
2051     return false;
2052 }
2053 
2054 int ram_write_tracking_start(void)
2055 {
2056     assert(0);
2057     return -1;
2058 }
2059 
2060 void ram_write_tracking_stop(void)
2061 {
2062     assert(0);
2063 }
2064 #endif /* defined(__linux__) */
2065 
2066 /**
2067  * get_queued_page: unqueue a page from the postcopy requests
2068  *
2069  * Skips pages that are already sent (!dirty)
2070  *
2071  * Returns true if a queued page is found
2072  *
2073  * @rs: current RAM state
2074  * @pss: data about the state of the current dirty page scan
2075  */
2076 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2077 {
2078     RAMBlock  *block;
2079     ram_addr_t offset;
2080     bool dirty;
2081 
2082     do {
2083         block = unqueue_page(rs, &offset);
2084         /*
2085          * We're sending this page, and since it's postcopy nothing else
2086          * will dirty it, and we must make sure it doesn't get sent again
2087          * even if this queue request was received after the background
2088          * search already sent it.
2089          */
2090         if (block) {
2091             unsigned long page;
2092 
2093             page = offset >> TARGET_PAGE_BITS;
2094             dirty = test_bit(page, block->bmap);
2095             if (!dirty) {
2096                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2097                                                 page);
2098             } else {
2099                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2100             }
2101         }
2102 
2103     } while (block && !dirty);
2104 
2105     if (!block) {
2106         /*
2107          * Poll write faults too if background snapshot is enabled; that's
2108          * when vCPUs can get blocked by the write-protected pages.
2109          */
2110         block = poll_fault_page(rs, &offset);
2111     }
2112 
2113     if (block) {
2114         /*
2115          * We want the background search to continue from the queued page
2116          * since the guest is likely to want other pages near to the page
2117          * it just requested.
2118          */
2119         pss->block = block;
2120         pss->page = offset >> TARGET_PAGE_BITS;
2121 
2122         /*
2123          * This unqueued page would break the "one round" check, even if
2124          * that is really rare.
2125          */
2126         pss->complete_round = false;
2127     }
2128 
2129     return !!block;
2130 }
2131 
2132 /**
2133  * migration_page_queue_free: drop any remaining pages in the ram
2134  * request queue
2135  *
2136  * It should be empty at the end anyway, but in error cases there may
2137  * be some left.  In case any page is left, we drop it.
2138  *
2139  */
2140 static void migration_page_queue_free(RAMState *rs)
2141 {
2142     struct RAMSrcPageRequest *mspr, *next_mspr;
2143     /* This queue should generally be empty - but in the case of a failed
2144      * migration it might have some leftover entries.
2145      */
2146     RCU_READ_LOCK_GUARD();
2147     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2148         memory_region_unref(mspr->rb->mr);
2149         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2150         g_free(mspr);
2151     }
2152 }
2153 
2154 /**
2155  * ram_save_queue_pages: queue the page for transmission
2156  *
2157  * A request from postcopy destination for example.
2158  *
2159  * Returns zero on success or negative on error
2160  *
2161  * @rbname: Name of the RAMBlock of the request. NULL means the
2162  *          same as the last one.
2163  * @start: starting address from the start of the RAMBlock
2164  * @len: length (in bytes) to send
2165  */
2166 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2167 {
2168     RAMBlock *ramblock;
2169     RAMState *rs = ram_state;
2170 
2171     stat64_add(&ram_counters.postcopy_requests, 1);
2172     RCU_READ_LOCK_GUARD();
2173 
2174     if (!rbname) {
2175         /* Reuse last RAMBlock */
2176         ramblock = rs->last_req_rb;
2177 
2178         if (!ramblock) {
2179             /*
2180              * Shouldn't happen, we can't reuse the last RAMBlock if
2181              * it's the 1st request.
2182              */
2183             error_report("ram_save_queue_pages no previous block");
2184             return -1;
2185         }
2186     } else {
2187         ramblock = qemu_ram_block_by_name(rbname);
2188 
2189         if (!ramblock) {
2190             /* We shouldn't be asked for a non-existent RAMBlock */
2191             error_report("ram_save_queue_pages no block '%s'", rbname);
2192             return -1;
2193         }
2194         rs->last_req_rb = ramblock;
2195     }
2196     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2197     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2198         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2199                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2200                      __func__, start, len, ramblock->used_length);
2201         return -1;
2202     }
2203 
2204     /*
2205      * With postcopy preempt enabled, we send the page back directly from
2206      * the rp-return thread.
2207      */
2208     if (postcopy_preempt_active()) {
2209         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2210         size_t page_size = qemu_ram_pagesize(ramblock);
2211         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2212         int ret = 0;
2213 
2214         qemu_mutex_lock(&rs->bitmap_mutex);
2215 
2216         pss_init(pss, ramblock, page_start);
2217         /*
2218          * Always use the preempt channel, and make sure it's there.  It's
2219          * safe to access without a lock, because while the rp-thread is
2220          * running we should be the only one operating on the qemufile.
2221          */
2222         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2223         assert(pss->pss_channel);
2224 
2225         /*
2226          * It must be one host page or a multiple of the host page size.
2227          * Just assert; if something is wrong we're mostly split-brain anyway.
2228          */
2229         assert(len % page_size == 0);
2230         while (len) {
2231             if (ram_save_host_page_urgent(pss)) {
2232                 error_report("%s: ram_save_host_page_urgent() failed: "
2233                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2234                              __func__, ramblock->idstr, start);
2235                 ret = -1;
2236                 break;
2237             }
2238             /*
2239              * NOTE: after ram_save_host_page_urgent() succeeds, pss->page
2240              * will automatically have moved to point at the next host page
2241              * we're going to send, so there is no need to update it here.
2242              *
2243              * Normally QEMU never sends more than one host page per
2244              * request, so the loop should only run once; we keep it anyway
2245              * for consistency.
2246              */
2247             len -= page_size;
2248         }
2249         qemu_mutex_unlock(&rs->bitmap_mutex);
2250 
2251         return ret;
2252     }
2253 
2254     struct RAMSrcPageRequest *new_entry =
2255         g_new0(struct RAMSrcPageRequest, 1);
2256     new_entry->rb = ramblock;
2257     new_entry->offset = start;
2258     new_entry->len = len;
2259 
2260     memory_region_ref(ramblock->mr);
2261     qemu_mutex_lock(&rs->src_page_req_mutex);
2262     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2263     migration_make_urgent_request();
2264     qemu_mutex_unlock(&rs->src_page_req_mutex);
2265 
2266     return 0;
2267 }
2268 
2269 static bool save_page_use_compression(RAMState *rs)
2270 {
2271     if (!migrate_compress()) {
2272         return false;
2273     }
2274 
2275     /*
2276      * If xbzrle is enabled (e.g., after first round of migration), stop
2277      * using the data compression. In theory, xbzrle can do better than
2278      * compression.
2279      */
2280     if (rs->xbzrle_enabled) {
2281         return false;
2282     }
2283 
2284     return true;
2285 }
2286 
2287 /*
2288  * Try to compress the page before posting it out; return true if the page
2289  * has been properly handled by compression, false if other paths need
2290  * to handle it.
2291  */
2292 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2293                                RAMBlock *block, ram_addr_t offset)
2294 {
2295     if (!save_page_use_compression(rs)) {
2296         return false;
2297     }
2298 
2299     /*
2300      * When starting to process a new block, the first page of the
2301      * block should be sent out before other pages in the same block,
2302      * and all the pages in the last block should already have been
2303      * sent out.  Keeping this order is important, because the 'cont'
2304      * flag is used to avoid resending the block name.
2305      *
2306      * We post the first page as a normal page since compression takes
2307      * a lot of CPU resources.
2308      */
2309     if (block != pss->last_sent_block) {
2310         flush_compressed_data(rs);
2311         return false;
2312     }
2313 
2314     if (compress_page_with_multi_thread(block, offset) > 0) {
2315         return true;
2316     }
2317 
2318     compression_counters.busy++;
2319     return false;
2320 }
2321 
2322 /**
2323  * ram_save_target_page_legacy: save one target page
2324  *
2325  * Returns the number of pages written
2326  *
2327  * @rs: current RAM state
2328  * @pss: data about the page we want to send
2329  */
2330 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2331 {
2332     RAMBlock *block = pss->block;
2333     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2334     int res;
2335 
2336     if (control_save_page(pss, block, offset, &res)) {
2337         return res;
2338     }
2339 
2340     if (save_compress_page(rs, pss, block, offset)) {
2341         return 1;
2342     }
2343 
2344     res = save_zero_page(pss, pss->pss_channel, block, offset);
2345     if (res > 0) {
2346         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2347          * page would be stale
2348          */
2349         if (rs->xbzrle_enabled) {
2350             XBZRLE_cache_lock();
2351             xbzrle_cache_zero_page(rs, block->offset + offset);
2352             XBZRLE_cache_unlock();
2353         }
2354         return res;
2355     }
2356 
2357     /*
2358      * Do not use multifd during postcopy, as one whole host page must be
2359      * placed atomically.  Postcopy requires atomic updates of pages, so
2360      * even if host page size == guest page size, the running destination
2361      * guest may still see partially copied pages, which is data corruption.
2362      */
2363     if (migrate_multifd() && !migration_in_postcopy()) {
2364         return ram_save_multifd_page(pss->pss_channel, block, offset);
2365     }
2366 
2367     return ram_save_page(rs, pss);
2368 }
2369 
2370 /* Should be called before sending a host page */
2371 static void pss_host_page_prepare(PageSearchStatus *pss)
2372 {
2373     /* How many guest pages are there in one host page? */
2374     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2375 
2376     pss->host_page_sending = true;
2377     if (guest_pfns <= 1) {
2378         /*
2379          * This covers both when guest psize == host psize, or when guest
2380          * has larger psize than the host (guest_pfns==0).
2381          *
2382          * For the latter, we always send one whole guest page per
2383          * iteration of the host page (example: an Alpha VM on x86 host
2384          * will have guest psize 8K while host psize 4K).
2385          */
2386         pss->host_page_start = pss->page;
2387         pss->host_page_end = pss->page + 1;
2388     } else {
2389         /*
2390          * The host page spans over multiple guest pages, we send them
2391          * within the same host page iteration.
2392          */
2393         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2394         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2395     }
2396 }
2397 
2398 /*
2399  * Whether the page pointed by PSS is within the host page being sent.
2400  * Must be called after a previous pss_host_page_prepare().
2401  */
2402 static bool pss_within_range(PageSearchStatus *pss)
2403 {
2404     ram_addr_t ram_addr;
2405 
2406     assert(pss->host_page_sending);
2407 
2408     /* Over host-page boundary? */
2409     if (pss->page >= pss->host_page_end) {
2410         return false;
2411     }
2412 
2413     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2414 
2415     return offset_in_ramblock(pss->block, ram_addr);
2416 }
2417 
2418 static void pss_host_page_finish(PageSearchStatus *pss)
2419 {
2420     pss->host_page_sending = false;
2421     /* This is not needed, but just to reset it */
2422     pss->host_page_start = pss->host_page_end = 0;
2423 }
2424 
2425 /*
2426  * Send an urgent host page specified by `pss'.  Must be called with
2427  * bitmap_mutex held.
2428  *
2429  * Returns 0 if saving the host page succeeded, a negative value otherwise.
2430  */
2431 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2432 {
2433     bool page_dirty, sent = false;
2434     RAMState *rs = ram_state;
2435     int ret = 0;
2436 
2437     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2438     pss_host_page_prepare(pss);
2439 
2440     /*
2441      * If precopy is sending the same page, let it be done in precopy, or
2442      * we could send the same page in two channels and none of them will
2443      * receive the whole page.
2444      */
2445     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2446         trace_postcopy_preempt_hit(pss->block->idstr,
2447                                    pss->page << TARGET_PAGE_BITS);
2448         return 0;
2449     }
2450 
2451     do {
2452         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2453 
2454         if (page_dirty) {
2455             /* Be strict about the return code; it must be 1 (exactly one page sent) */
2456             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2457                 error_report_once("%s: ram_save_target_page failed", __func__);
2458                 ret = -1;
2459                 goto out;
2460             }
2461             sent = true;
2462         }
2463         pss_find_next_dirty(pss);
2464     } while (pss_within_range(pss));
2465 out:
2466     pss_host_page_finish(pss);
2467     /* For urgent requests, flush immediately if sent */
2468     if (sent) {
2469         qemu_fflush(pss->pss_channel);
2470     }
2471     return ret;
2472 }
2473 
2474 /**
2475  * ram_save_host_page: save a whole host page
2476  *
2477  * Starting at pss->page, send pages up to the end of the current host
2478  * page. It's valid for the initial page to point into the middle of
2479  * a host page in which case the remainder of the hostpage is sent.
2480  * Only dirty target pages are sent. Note that the host page size may
2481  * be a huge page for this block.
2482  *
2483  * The saving stops at the boundary of the used_length of the block
2484  * if the RAMBlock isn't a multiple of the host page size.
2485  *
2486  * The caller must hold ram_state.bitmap_mutex when calling this
2487  * function.  Note that this function can temporarily release the lock, but
2488  * it makes sure the lock is held again before it returns.
2489  *
2490  * Returns the number of pages written or negative on error
2491  *
2492  * @rs: current RAM state
2493  * @pss: data about the page we want to send
2494  */
2495 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2496 {
2497     bool page_dirty, preempt_active = postcopy_preempt_active();
2498     int tmppages, pages = 0;
2499     size_t pagesize_bits =
2500         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2501     unsigned long start_page = pss->page;
2502     int res;
2503 
2504     if (ramblock_is_ignored(pss->block)) {
2505         error_report("block %s should not be migrated !", pss->block->idstr);
2506         return 0;
2507     }
2508 
2509     /* Update host page boundary information */
2510     pss_host_page_prepare(pss);
2511 
2512     do {
2513         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2514 
2515         /* Check whether the page is dirty and, if it is, send it */
2516         if (page_dirty) {
2517             /*
2518              * Properly yield the lock only in postcopy preempt mode
2519              * because both migration thread and rp-return thread can
2520              * operate on the bitmaps.
2521              */
2522             if (preempt_active) {
2523                 qemu_mutex_unlock(&rs->bitmap_mutex);
2524             }
2525             tmppages = migration_ops->ram_save_target_page(rs, pss);
2526             if (tmppages >= 0) {
2527                 pages += tmppages;
2528                 /*
2529                  * Allow rate limiting to happen in the middle of huge pages if
2530                  * something is sent in the current iteration.
2531                  */
2532                 if (pagesize_bits > 1 && tmppages > 0) {
2533                     migration_rate_limit();
2534                 }
2535             }
2536             if (preempt_active) {
2537                 qemu_mutex_lock(&rs->bitmap_mutex);
2538             }
2539         } else {
2540             tmppages = 0;
2541         }
2542 
2543         if (tmppages < 0) {
2544             pss_host_page_finish(pss);
2545             return tmppages;
2546         }
2547 
2548         pss_find_next_dirty(pss);
2549     } while (pss_within_range(pss));
2550 
2551     pss_host_page_finish(pss);
2552 
2553     res = ram_save_release_protection(rs, pss, start_page);
2554     return (res < 0 ? res : pages);
2555 }
2556 
2557 /**
2558  * ram_find_and_save_block: finds a dirty page and sends it to f
2559  *
2560  * Called within an RCU critical section.
2561  *
2562  * Returns the number of pages written where zero means no dirty pages,
2563  * or negative on error
2564  *
2565  * @rs: current RAM state
2566  *
2567  * On systems where host-page-size > target-page-size it will send all the
2568  * pages in a host page that are dirty.
2569  */
2570 static int ram_find_and_save_block(RAMState *rs)
2571 {
2572     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2573     int pages = 0;
2574 
2575     /* No dirty page as there is zero RAM */
2576     if (!rs->ram_bytes_total) {
2577         return pages;
2578     }
2579 
2580     /*
2581      * Always keep last_seen_block/last_page valid during this procedure,
2582      * because find_dirty_block() relies on these values (e.g., we compare
2583      * last_seen_block with pss.block to see whether we searched all the
2584      * ramblocks) to detect the completion of migration.  A NULL value of
2585      * last_seen_block can, in some conditions, cause the loop below to run forever.
2586      */
2587     if (!rs->last_seen_block) {
2588         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2589         rs->last_page = 0;
2590     }
2591 
2592     pss_init(pss, rs->last_seen_block, rs->last_page);
2593 
2594     while (true) {
2595         if (!get_queued_page(rs, pss)) {
2596             /* priority queue empty, so just search for something dirty */
2597             int res = find_dirty_block(rs, pss);
2598             if (res != PAGE_DIRTY_FOUND) {
2599                 if (res == PAGE_ALL_CLEAN) {
2600                     break;
2601                 } else if (res == PAGE_TRY_AGAIN) {
2602                     continue;
2603                 }
2604             }
2605         }
2606         pages = ram_save_host_page(rs, pss);
2607         if (pages) {
2608             break;
2609         }
2610     }
2611 
2612     rs->last_seen_block = pss->block;
2613     rs->last_page = pss->page;
2614 
2615     return pages;
2616 }
2617 
2618 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2619 {
2620     uint64_t pages = size / TARGET_PAGE_SIZE;
2621 
2622     if (zero) {
2623         stat64_add(&ram_counters.zero_pages, pages);
2624     } else {
2625         stat64_add(&ram_counters.normal_pages, pages);
2626         ram_transferred_add(size);
2627         qemu_file_credit_transfer(f, size);
2628     }
2629 }
2630 
2631 static uint64_t ram_bytes_total_with_ignored(void)
2632 {
2633     RAMBlock *block;
2634     uint64_t total = 0;
2635 
2636     RCU_READ_LOCK_GUARD();
2637 
2638     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2639         total += block->used_length;
2640     }
2641     return total;
2642 }
2643 
2644 uint64_t ram_bytes_total(void)
2645 {
2646     RAMBlock *block;
2647     uint64_t total = 0;
2648 
2649     RCU_READ_LOCK_GUARD();
2650 
2651     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2652         total += block->used_length;
2653     }
2654     return total;
2655 }
2656 
2657 static void xbzrle_load_setup(void)
2658 {
2659     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2660 }
2661 
2662 static void xbzrle_load_cleanup(void)
2663 {
2664     g_free(XBZRLE.decoded_buf);
2665     XBZRLE.decoded_buf = NULL;
2666 }
2667 
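/*
 * ram_state_cleanup: free the RAMState and everything it owns (the page
 * request queue and its mutexes), then clear the caller's pointer.
 */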
2668 static void ram_state_cleanup(RAMState **rsp)
2669 {
2670     if (*rsp) {
2671         migration_page_queue_free(*rsp);
2672         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2673         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2674         g_free(*rsp);
2675         *rsp = NULL;
2676     }
2677 }
2678 
2679 static void xbzrle_cleanup(void)
2680 {
2681     XBZRLE_cache_lock();
2682     if (XBZRLE.cache) {
2683         cache_fini(XBZRLE.cache);
2684         g_free(XBZRLE.encoded_buf);
2685         g_free(XBZRLE.current_buf);
2686         g_free(XBZRLE.zero_target_page);
2687         XBZRLE.cache = NULL;
2688         XBZRLE.encoded_buf = NULL;
2689         XBZRLE.current_buf = NULL;
2690         XBZRLE.zero_target_page = NULL;
2691     }
2692     XBZRLE_cache_unlock();
2693 }
2694 
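/*
 * ram_save_cleanup: tear down the RAM save state at the end of migration:
 * stop dirty logging (unless running a background snapshot, which never
 * started it), free the per-RAMBlock dirty bitmaps, and release the XBZRLE,
 * compression and RAMState resources.
 */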
2695 static void ram_save_cleanup(void *opaque)
2696 {
2697     RAMState **rsp = opaque;
2698     RAMBlock *block;
2699 
2700     /* We don't use dirty log with background snapshots */
2701     if (!migrate_background_snapshot()) {
2702         /* The caller holds the iothread lock or is in a bottom half, so
2703          * there is no write race against the migration bitmap
2704          */
2705         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2706             /*
2707              * Do not stop dirty logging without having started it, since
2708              * memory_global_dirty_log_stop will assert that
2709              * memory_global_dirty_log_start/stop are used in pairs
2710              */
2711             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2712         }
2713     }
2714 
2715     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2716         g_free(block->clear_bmap);
2717         block->clear_bmap = NULL;
2718         g_free(block->bmap);
2719         block->bmap = NULL;
2720     }
2721 
2722     xbzrle_cleanup();
2723     compress_threads_save_cleanup();
2724     ram_state_cleanup(rsp);
2725     g_free(migration_ops);
2726     migration_ops = NULL;
2727 }
2728 
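/*
 * ram_state_reset: reset the page search state so that the next scan starts
 * from the beginning of the RAMBlock list, with XBZRLE disabled again.
 */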
2729 static void ram_state_reset(RAMState *rs)
2730 {
2731     int i;
2732 
2733     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2734         rs->pss[i].last_sent_block = NULL;
2735     }
2736 
2737     rs->last_seen_block = NULL;
2738     rs->last_page = 0;
2739     rs->last_version = ram_list.version;
2740     rs->xbzrle_enabled = false;
2741 }
2742 
2743 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2744 
2745 /* **** functions for postcopy ***** */
2746 
2747 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2748 {
2749     struct RAMBlock *block;
2750 
2751     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2752         unsigned long *bitmap = block->bmap;
2753         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2754         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2755 
2756         while (run_start < range) {
2757             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2758             ram_discard_range(block->idstr,
2759                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2760                               ((ram_addr_t)(run_end - run_start))
2761                                 << TARGET_PAGE_BITS);
2762             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2763         }
2764     }
2765 }
2766 
2767 /**
2768  * postcopy_send_discard_bm_ram: discard a RAMBlock
2769  *
2770  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2771  *
2772  * @ms: current migration state
2773  * @block: RAMBlock to discard
2774  */
2775 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2776 {
2777     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2778     unsigned long current;
2779     unsigned long *bitmap = block->bmap;
2780 
2781     for (current = 0; current < end; ) {
2782         unsigned long one = find_next_bit(bitmap, end, current);
2783         unsigned long zero, discard_length;
2784 
2785         if (one >= end) {
2786             break;
2787         }
2788 
2789         zero = find_next_zero_bit(bitmap, end, one + 1);
2790 
2791         if (zero >= end) {
2792             discard_length = end - one;
2793         } else {
2794             discard_length = zero - one;
2795         }
2796         postcopy_discard_send_range(ms, one, discard_length);
2797         current = one + discard_length;
2798     }
2799 }
2800 
2801 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2802 
2803 /**
2804  * postcopy_each_ram_send_discard: discard all RAMBlocks
2805  *
2806  * Utility for the outgoing postcopy code.
2807  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2808  *   passing it bitmap indexes and name.
2809  * (qemu_ram_foreach_block ends up passing unscaled lengths
2810  *  which would mean postcopy code would have to deal with target page)
2811  *
2812  * @ms: current migration state
2813  */
2814 static void postcopy_each_ram_send_discard(MigrationState *ms)
2815 {
2816     struct RAMBlock *block;
2817 
2818     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2819         postcopy_discard_send_init(ms, block->idstr);
2820 
2821         /*
2822          * Deal with TPS != HPS and huge pages.  It discards any partially sent
2823          * host-page size chunks and marks any partially dirty host-page size
2824          * chunks as all dirty.  In this case the host-page is the host-page
2825          * for the particular RAMBlock, i.e. it might be a huge page.
2826          */
2827         postcopy_chunk_hostpages_pass(ms, block);
2828 
2829         /*
2830          * Postcopy sends chunks of bitmap over the wire, but it
2831          * just needs indexes at this point, which avoids it having
2832          * target-page-specific code.
2833          */
2834         postcopy_send_discard_bm_ram(ms, block);
2835         postcopy_discard_send_finish(ms);
2836     }
2837 }
2838 
2839 /**
2840  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2841  *
2842  * Helper for postcopy_each_ram_send_discard(); it canonicalizes the
2843  * dirty bitmap at host-page granularity.
2845  *
2846  * Postcopy requires that all target pages in a hostpage are dirty or
2847  * clean, not a mix.  This function canonicalizes the bitmaps.
2848  *
2849  * @ms: current migration state
2850  * @block: block that contains the page we want to canonicalize
2851  */
2852 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2853 {
2854     RAMState *rs = ram_state;
2855     unsigned long *bitmap = block->bmap;
2856     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2857     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2858     unsigned long run_start;
2859 
2860     if (block->page_size == TARGET_PAGE_SIZE) {
2861         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2862         return;
2863     }
2864 
2865     /* Find a dirty page */
2866     run_start = find_next_bit(bitmap, pages, 0);
2867 
2868     while (run_start < pages) {
2869 
2870         /*
2871          * If the start of this run of pages is in the middle of a host
2872          * page, then we need to fixup this host page.
2873          */
2874         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2875             /* Find the end of this run */
2876             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2877             /*
2878              * If the end isn't at the start of a host page, then the
2879              * run doesn't finish at the end of a host page
2880              * and we need to discard.
2881              */
2882         }
2883 
2884         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2885             unsigned long page;
2886             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2887                                                              host_ratio);
2888             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2889 
2890             /* Clean up the bitmap */
2891             for (page = fixup_start_addr;
2892                  page < fixup_start_addr + host_ratio; page++) {
2893                 /*
2894                  * Remark them as dirty, updating the count for any pages
2895                  * that weren't previously dirty.
2896                  */
2897                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2898             }
2899         }
2900 
2901         /* Find the next dirty page for the next iteration */
2902         run_start = find_next_bit(bitmap, pages, run_start);
2903     }
2904 }
2905 
2906 /**
2907  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2908  *
2909  * Transmit the set of pages to be discarded after precopy to the target;
2910  * these are pages that:
2911  *     a) Have been previously transmitted but are now dirty again
2912  *     b) Pages that have never been transmitted, this ensures that
2913  *        any pages on the destination that have been mapped by background
2914  *        tasks get discarded (transparent huge pages is the specific concern)
2915  * Hopefully this is pretty sparse
2916  *
2917  * @ms: current migration state
2918  */
2919 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2920 {
2921     RAMState *rs = ram_state;
2922 
2923     RCU_READ_LOCK_GUARD();
2924 
2925     /* This should be our last sync, the src is now paused */
2926     migration_bitmap_sync(rs);
2927 
2928     /* Easiest way to make sure we don't resume in the middle of a host-page */
2929     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2930     rs->last_seen_block = NULL;
2931     rs->last_page = 0;
2932 
2933     postcopy_each_ram_send_discard(ms);
2934 
2935     trace_ram_postcopy_send_discard_bitmap();
2936 }
2937 
2938 /**
2939  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2940  *
2941  * Returns zero on success
2942  *
2943  * @rbname: name of the RAMBlock of the request. NULL means the
2944  *          same as the last one.
2945  * @start: starting byte offset within the RAMBlock
2946  * @length: length (in bytes) to discard
2947  */
2948 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2949 {
2950     trace_ram_discard_range(rbname, start, length);
2951 
2952     RCU_READ_LOCK_GUARD();
2953     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2954 
2955     if (!rb) {
2956         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2957         return -1;
2958     }
2959 
2960     /*
2961      * On source VM, we don't need to update the received bitmap since
2962      * we don't even have one.
2963      */
2964     if (rb->receivedmap) {
2965         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2966                      length >> qemu_target_page_bits());
2967     }
2968 
2969     return ram_block_discard_range(rb, start, length);
2970 }
2971 
2972 /*
2973  * For every allocation, we will try not to crash the VM if the
2974  * allocation fails.
2975  */
2976 static int xbzrle_init(void)
2977 {
2978     Error *local_err = NULL;
2979 
2980     if (!migrate_xbzrle()) {
2981         return 0;
2982     }
2983 
2984     XBZRLE_cache_lock();
2985 
2986     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2987     if (!XBZRLE.zero_target_page) {
2988         error_report("%s: Error allocating zero page", __func__);
2989         goto err_out;
2990     }
2991 
2992     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2993                               TARGET_PAGE_SIZE, &local_err);
2994     if (!XBZRLE.cache) {
2995         error_report_err(local_err);
2996         goto free_zero_page;
2997     }
2998 
2999     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3000     if (!XBZRLE.encoded_buf) {
3001         error_report("%s: Error allocating encoded_buf", __func__);
3002         goto free_cache;
3003     }
3004 
3005     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3006     if (!XBZRLE.current_buf) {
3007         error_report("%s: Error allocating current_buf", __func__);
3008         goto free_encoded_buf;
3009     }
3010 
3011     /* We are all good */
3012     XBZRLE_cache_unlock();
3013     return 0;
3014 
3015 free_encoded_buf:
3016     g_free(XBZRLE.encoded_buf);
3017     XBZRLE.encoded_buf = NULL;
3018 free_cache:
3019     cache_fini(XBZRLE.cache);
3020     XBZRLE.cache = NULL;
3021 free_zero_page:
3022     g_free(XBZRLE.zero_target_page);
3023     XBZRLE.zero_target_page = NULL;
3024 err_out:
3025     XBZRLE_cache_unlock();
3026     return -ENOMEM;
3027 }
3028 
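/*
 * ram_state_init: allocate and initialise the global RAMState.  The dirty
 * page count starts as the total number of RAM pages, matching the initial
 * all-ones dirty bitmaps.
 *
 * Returns zero on success and negative on allocation failure.
 */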
3029 static int ram_state_init(RAMState **rsp)
3030 {
3031     *rsp = g_try_new0(RAMState, 1);
3032 
3033     if (!*rsp) {
3034         error_report("%s: Init ramstate fail", __func__);
3035         return -1;
3036     }
3037 
3038     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3039     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3040     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3041     (*rsp)->ram_bytes_total = ram_bytes_total();
3042 
3043     /*
3044      * Count the total number of pages used by ram blocks not including any
3045      * gaps due to alignment or unplugs.
3046      * This must match the initial values of the dirty bitmap.
3047      */
3048     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3049     ram_state_reset(*rsp);
3050 
3051     return 0;
3052 }
3053 
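/*
 * ram_list_init_bitmaps: allocate the per-RAMBlock migration dirty bitmaps
 * (initialised to all ones) and the clear_bmap, sized according to
 * clear_bitmap_shift, used when clearing the dirty log.
 */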
3054 static void ram_list_init_bitmaps(void)
3055 {
3056     MigrationState *ms = migrate_get_current();
3057     RAMBlock *block;
3058     unsigned long pages;
3059     uint8_t shift;
3060 
3061     /* Skip setting bitmap if there is no RAM */
3062     if (ram_bytes_total()) {
3063         shift = ms->clear_bitmap_shift;
3064         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3065             error_report("clear_bitmap_shift (%u) too big, using "
3066                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3067             shift = CLEAR_BITMAP_SHIFT_MAX;
3068         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3069             error_report("clear_bitmap_shift (%u) too small, using "
3070                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3071             shift = CLEAR_BITMAP_SHIFT_MIN;
3072         }
3073 
3074         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3075             pages = block->max_length >> TARGET_PAGE_BITS;
3076             /*
3077              * The initial dirty bitmap for migration must be set with all
3078              * ones to make sure we'll migrate every guest RAM page to the
3079              * destination.
3080              * Here we set RAMBlock.bmap all to 1 because, when restarting a
3081              * new migration after a failed one, ram_list.
3082              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
3083              * guest memory.
3084              */
3085             block->bmap = bitmap_new(pages);
3086             bitmap_set(block->bmap, 0, pages);
3087             block->clear_bmap_shift = shift;
3088             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3089         }
3090     }
3091 }
3092 
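/*
 * migration_bitmap_clear_discarded_pages: remove pages discarded by a
 * RamDiscardManager from the migration dirty bitmaps and adjust the dirty
 * page count accordingly.
 */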
3093 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3094 {
3095     unsigned long pages;
3096     RAMBlock *rb;
3097 
3098     RCU_READ_LOCK_GUARD();
3099 
3100     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3101             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3102             rs->migration_dirty_pages -= pages;
3103     }
3104 }
3105 
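/*
 * ram_init_bitmaps: set up the dirty bitmaps and, unless running a
 * background snapshot, start dirty logging and do the initial bitmap sync;
 * finally exclude any discarded pages from the bitmaps.
 */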
3106 static void ram_init_bitmaps(RAMState *rs)
3107 {
3108     /* For memory_global_dirty_log_start below.  */
3109     qemu_mutex_lock_iothread();
3110     qemu_mutex_lock_ramlist();
3111 
3112     WITH_RCU_READ_LOCK_GUARD() {
3113         ram_list_init_bitmaps();
3114         /* We don't use dirty log with background snapshots */
3115         if (!migrate_background_snapshot()) {
3116             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3117             migration_bitmap_sync_precopy(rs);
3118         }
3119     }
3120     qemu_mutex_unlock_ramlist();
3121     qemu_mutex_unlock_iothread();
3122 
3123     /*
3124      * After an eventual first bitmap sync, fixup the initial bitmap
3125      * containing all 1s to exclude any discarded pages from migration.
3126      */
3127     migration_bitmap_clear_discarded_pages(rs);
3128 }
3129 
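/*
 * ram_init_all: initialise everything needed for a RAM save: the RAMState,
 * the XBZRLE cache (if enabled) and the dirty bitmaps.
 *
 * Returns zero on success and negative on error.
 */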
3130 static int ram_init_all(RAMState **rsp)
3131 {
3132     if (ram_state_init(rsp)) {
3133         return -1;
3134     }
3135 
3136     if (xbzrle_init()) {
3137         ram_state_cleanup(rsp);
3138         return -1;
3139     }
3140 
3141     ram_init_bitmaps(*rsp);
3142 
3143     return 0;
3144 }
3145 
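/*
 * ram_state_resume_prepare: prepare RAMState for a postcopy resume by
 * recounting dirty pages from the existing bitmaps, resetting the page
 * search state and caching the new output channel.
 */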
3146 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3147 {
3148     RAMBlock *block;
3149     uint64_t pages = 0;
3150 
3151     /*
3152      * Postcopy is not using xbzrle/compression, so no need for that.
3153      * Also, since the source is already halted, we don't need to care
3154      * about dirty page logging either.
3155      */
3156 
3157     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3158         pages += bitmap_count_one(block->bmap,
3159                                   block->used_length >> TARGET_PAGE_BITS);
3160     }
3161 
3162     /* This may not be aligned with current bitmaps. Recalculate. */
3163     rs->migration_dirty_pages = pages;
3164 
3165     ram_state_reset(rs);
3166 
3167     /* Update RAMState cache of output QEMUFile */
3168     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3169 
3170     trace_ram_state_resume_prepare(pages);
3171 }
3172 
3173 /*
3174  * This function clears bits of the free pages reported by the caller from the
3175  * migration dirty bitmap. @addr is the host address corresponding to the
3176  * start of the continuous guest free pages, and @len is the total bytes of
3177  * those pages.
3178  */
3179 void qemu_guest_free_page_hint(void *addr, size_t len)
3180 {
3181     RAMBlock *block;
3182     ram_addr_t offset;
3183     size_t used_len, start, npages;
3184     MigrationState *s = migrate_get_current();
3185 
3186     /* This function is currently expected to be used during live migration */
3187     if (!migration_is_setup_or_active(s->state)) {
3188         return;
3189     }
3190 
3191     for (; len > 0; len -= used_len, addr += used_len) {
3192         block = qemu_ram_block_from_host(addr, false, &offset);
3193         if (unlikely(!block || offset >= block->used_length)) {
3194             /*
3195              * The implementation might not support RAMBlock resize during
3196              * live migration, but it could happen in theory with future
3197              * updates. So we add a check here to capture that case.
3198              */
3199             error_report_once("%s unexpected error", __func__);
3200             return;
3201         }
3202 
3203         if (len <= block->used_length - offset) {
3204             used_len = len;
3205         } else {
3206             used_len = block->used_length - offset;
3207         }
3208 
3209         start = offset >> TARGET_PAGE_BITS;
3210         npages = used_len >> TARGET_PAGE_BITS;
3211 
3212         qemu_mutex_lock(&ram_state->bitmap_mutex);
3213         /*
3214          * The skipped free pages count as already sent from clear_bmap's
3215          * perspective, so clear the bits from the memory region bitmap which
3216          * are initially set. Otherwise those skipped pages will be sent in
3217          * the next round after syncing from the memory region bitmap.
3218          */
3219         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3220         ram_state->migration_dirty_pages -=
3221                       bitmap_count_one_with_offset(block->bmap, start, npages);
3222         bitmap_clear(block->bmap, start, npages);
3223         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3224     }
3225 }
3226 
3227 /*
3228  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3229  * long-running RCU critical section.  When RCU reclaims in the code
3230  * start to become numerous it will be necessary to reduce the
3231  * granularity of these critical sections.
3232  */
3233 
3234 /**
3235  * ram_save_setup: Setup RAM for migration
3236  *
3237  * Returns zero to indicate success and negative for error
3238  *
3239  * @f: QEMUFile where to send the data
3240  * @opaque: RAMState pointer
3241  */
3242 static int ram_save_setup(QEMUFile *f, void *opaque)
3243 {
3244     RAMState **rsp = opaque;
3245     RAMBlock *block;
3246     int ret;
3247 
3248     if (compress_threads_save_setup()) {
3249         return -1;
3250     }
3251 
3252     /* Migration has already set up the bitmap; reuse it. */
3253     if (!migration_in_colo_state()) {
3254         if (ram_init_all(rsp) != 0) {
3255             compress_threads_save_cleanup();
3256             return -1;
3257         }
3258     }
3259     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3260 
3261     WITH_RCU_READ_LOCK_GUARD() {
3262         qemu_put_be64(f, ram_bytes_total_with_ignored()
3263                          | RAM_SAVE_FLAG_MEM_SIZE);
3264 
3265         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3266             qemu_put_byte(f, strlen(block->idstr));
3267             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3268             qemu_put_be64(f, block->used_length);
3269             if (migrate_postcopy_ram() && block->page_size !=
3270                                           qemu_host_page_size) {
3271                 qemu_put_be64(f, block->page_size);
3272             }
3273             if (migrate_ignore_shared()) {
3274                 qemu_put_be64(f, block->mr->addr);
3275             }
3276         }
3277     }
3278 
3279     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3280     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3281 
3282     migration_ops = g_malloc0(sizeof(MigrationOps));
3283     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3284     ret = multifd_send_sync_main(f);
3285     if (ret < 0) {
3286         return ret;
3287     }
3288 
3289     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3290     qemu_fflush(f);
3291 
3292     return 0;
3293 }
3294 
3295 /**
3296  * ram_save_iterate: iterative stage for migration
3297  *
3298  * Returns zero to indicate success and negative for error
3299  *
3300  * @f: QEMUFile where to send the data
3301  * @opaque: RAMState pointer
3302  */
3303 static int ram_save_iterate(QEMUFile *f, void *opaque)
3304 {
3305     RAMState **temp = opaque;
3306     RAMState *rs = *temp;
3307     int ret = 0;
3308     int i;
3309     int64_t t0;
3310     int done = 0;
3311 
3312     if (blk_mig_bulk_active()) {
3313         /* Avoid transferring ram during bulk phase of block migration as
3314          * the bulk phase will usually take a long time and transferring
3315          * ram updates during that time is pointless. */
3316         goto out;
3317     }
3318 
3319     /*
3320      * We'll hold this lock for a while, but that's okay for two reasons.
3321      * Firstly, the only other thread that can take it is the one calling
3322      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3323      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3324      * guarantees that we release it on a regular basis.
3325      */
3326     qemu_mutex_lock(&rs->bitmap_mutex);
3327     WITH_RCU_READ_LOCK_GUARD() {
3328         if (ram_list.version != rs->last_version) {
3329             ram_state_reset(rs);
3330         }
3331 
3332         /* Read version before ram_list.blocks */
3333         smp_rmb();
3334 
3335         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3336 
3337         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3338         i = 0;
3339         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3340                postcopy_has_request(rs)) {
3341             int pages;
3342 
3343             if (qemu_file_get_error(f)) {
3344                 break;
3345             }
3346 
3347             pages = ram_find_and_save_block(rs);
3348             /* no more pages to send */
3349             if (pages == 0) {
3350                 done = 1;
3351                 break;
3352             }
3353 
3354             if (pages < 0) {
3355                 qemu_file_set_error(f, pages);
3356                 break;
3357             }
3358 
3359             rs->target_page_count += pages;
3360 
3361             /*
3362              * During postcopy, it is necessary to make sure one whole host
3363              * page is sent in one chunk.
3364              */
3365             if (migrate_postcopy_ram()) {
3366                 flush_compressed_data(rs);
3367             }
3368 
3369             /*
3370              * We want to check in the 1st loop, just in case it was the 1st
3371              * time and we had to sync the dirty bitmap.
3372              * qemu_clock_get_ns() is a bit expensive, so we only check every
3373              * few iterations.
3374              */
3375             if ((i & 63) == 0) {
3376                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3377                               1000000;
3378                 if (t1 > MAX_WAIT) {
3379                     trace_ram_save_iterate_big_wait(t1, i);
3380                     break;
3381                 }
3382             }
3383             i++;
3384         }
3385     }
3386     qemu_mutex_unlock(&rs->bitmap_mutex);
3387 
3388     /*
3389      * Must occur before EOS (or any QEMUFile operation)
3390      * because of RDMA protocol.
3391      */
3392     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3393 
3394 out:
3395     if (ret >= 0
3396         && migration_is_setup_or_active(migrate_get_current()->state)) {
3397         ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3398         if (ret < 0) {
3399             return ret;
3400         }
3401 
3402         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3403         qemu_fflush(f);
3404         ram_transferred_add(8);
3405 
3406         ret = qemu_file_get_error(f);
3407     }
3408     if (ret < 0) {
3409         return ret;
3410     }
3411 
3412     return done;
3413 }
3414 
3415 /**
3416  * ram_save_complete: function called to send the remaining amount of ram
3417  *
3418  * Returns zero to indicate success or negative on error
3419  *
3420  * Called with iothread lock
3421  *
3422  * @f: QEMUFile where to send the data
3423  * @opaque: RAMState pointer
3424  */
3425 static int ram_save_complete(QEMUFile *f, void *opaque)
3426 {
3427     RAMState **temp = opaque;
3428     RAMState *rs = *temp;
3429     int ret = 0;
3430 
3431     rs->last_stage = !migration_in_colo_state();
3432 
3433     WITH_RCU_READ_LOCK_GUARD() {
3434         if (!migration_in_postcopy()) {
3435             migration_bitmap_sync_precopy(rs);
3436         }
3437 
3438         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3439 
3440         /* try transferring iterative blocks of memory */
3441 
3442         /* flush all remaining blocks regardless of rate limiting */
3443         qemu_mutex_lock(&rs->bitmap_mutex);
3444         while (true) {
3445             int pages;
3446 
3447             pages = ram_find_and_save_block(rs);
3448             /* no more blocks to send */
3449             if (pages == 0) {
3450                 break;
3451             }
3452             if (pages < 0) {
3453                 ret = pages;
3454                 break;
3455             }
3456         }
3457         qemu_mutex_unlock(&rs->bitmap_mutex);
3458 
3459         flush_compressed_data(rs);
3460         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3461     }
3462 
3463     if (ret < 0) {
3464         return ret;
3465     }
3466 
3467     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3468     if (ret < 0) {
3469         return ret;
3470     }
3471 
3472     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3473     qemu_fflush(f);
3474 
3475     return 0;
3476 }
3477 
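     /*
      * Cheap estimate of the remaining data to send: derived from the
      * current dirty page count, without forcing a bitmap sync.
      */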
3478 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3479                                        uint64_t *can_postcopy)
3480 {
3481     RAMState **temp = opaque;
3482     RAMState *rs = *temp;
3483 
3484     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3485 
3486     if (migrate_postcopy_ram()) {
3487         /* We can do postcopy, and all the data is postcopiable */
3488         *can_postcopy += remaining_size;
3489     } else {
3490         *must_precopy += remaining_size;
3491     }
3492 }
3493 
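     /*
      * More precise variant: if the estimate has dropped below the
      * threshold (and we're not in postcopy), sync the dirty bitmap under
      * the iothread lock before recomputing the remaining data.
      */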
3494 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3495                                     uint64_t *can_postcopy)
3496 {
3497     MigrationState *s = migrate_get_current();
3498     RAMState **temp = opaque;
3499     RAMState *rs = *temp;
3500 
3501     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3502 
3503     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3504         qemu_mutex_lock_iothread();
3505         WITH_RCU_READ_LOCK_GUARD() {
3506             migration_bitmap_sync_precopy(rs);
3507         }
3508         qemu_mutex_unlock_iothread();
3509         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3510     }
3511 
3512     if (migrate_postcopy_ram()) {
3513         /* We can do postcopy, and all the data is postcopiable */
3514         *can_postcopy += remaining_size;
3515     } else {
3516         *must_precopy += remaining_size;
3517     }
3518 }
3519 
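     /**
      * load_xbzrle: decode one XBZRLE-encoded page from the stream
      *
      * Returns zero to indicate success and -1 for error
      *
      * @f: QEMUFile where to read the data from
      * @addr: guest address of the page
      * @host: host address to decode the page into
      */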
3520 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3521 {
3522     unsigned int xh_len;
3523     int xh_flags;
3524     uint8_t *loaded_data;
3525 
3526     /* extract RLE header */
3527     xh_flags = qemu_get_byte(f);
3528     xh_len = qemu_get_be16(f);
3529 
3530     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3531         error_report("Failed to load XBZRLE page - wrong compression!");
3532         return -1;
3533     }
3534 
3535     if (xh_len > TARGET_PAGE_SIZE) {
3536         error_report("Failed to load XBZRLE page - len overflow!");
3537         return -1;
3538     }
3539     loaded_data = XBZRLE.decoded_buf;
3540     /* load data and decode */
3541     /* it can change loaded_data to point to an internal buffer */
3542     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3543 
3544     /* decode RLE */
3545     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3546                              TARGET_PAGE_SIZE) == -1) {
3547         error_report("Failed to load XBZRLE page - decode error!");
3548         return -1;
3549     }
3550 
3551     return 0;
3552 }
3553 
3554 /**
3555  * ram_block_from_stream: read a RAMBlock id from the migration stream
3556  *
3557  * Must be called from within a rcu critical section.
3558  *
3559  * Returns a pointer from within the RCU-protected ram_list.
3560  *
3561  * @mis: the migration incoming state pointer
3562  * @f: QEMUFile where to read the data from
3563  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3564  * @channel: the channel we're using
3565  */
3566 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3567                                               QEMUFile *f, int flags,
3568                                               int channel)
3569 {
3570     RAMBlock *block = mis->last_recv_block[channel];
3571     char id[256];
3572     uint8_t len;
3573 
3574     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3575         if (!block) {
3576             error_report("Ack, bad migration stream!");
3577             return NULL;
3578         }
3579         return block;
3580     }
3581 
3582     len = qemu_get_byte(f);
3583     qemu_get_buffer(f, (uint8_t *)id, len);
3584     id[len] = 0;
3585 
3586     block = qemu_ram_block_by_name(id);
3587     if (!block) {
3588         error_report("Can't find block %s", id);
3589         return NULL;
3590     }
3591 
3592     if (ramblock_is_ignored(block)) {
3593         error_report("block %s should not be migrated !", id);
3594         return NULL;
3595     }
3596 
3597     mis->last_recv_block[channel] = block;
3598 
3599     return block;
3600 }
3601 
3602 static inline void *host_from_ram_block_offset(RAMBlock *block,
3603                                                ram_addr_t offset)
3604 {
3605     if (!offset_in_ramblock(block, offset)) {
3606         return NULL;
3607     }
3608 
3609     return block->host + offset;
3610 }
3611 
3612 static void *host_page_from_ram_block_offset(RAMBlock *block,
3613                                              ram_addr_t offset)
3614 {
3615     /* Note: Explicitly no check against offset_in_ramblock(). */
3616     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3617                                    block->page_size);
3618 }
3619 
3620 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3621                                                          ram_addr_t offset)
3622 {
3623     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3624 }
3625 
3626 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3627                              ram_addr_t offset, bool record_bitmap)
3628 {
3629     if (!offset_in_ramblock(block, offset)) {
3630         return NULL;
3631     }
3632     if (!block->colo_cache) {
3633         error_report("%s: colo_cache is NULL in block :%s",
3634                      __func__, block->idstr);
3635         return NULL;
3636     }
3637 
3638     /*
3639     * During a COLO checkpoint, we need a bitmap of these migrated pages.
3640     * It helps us decide which pages in the RAM cache should be flushed
3641     * into the VM's RAM later.
3642     */
3643     if (record_bitmap &&
3644         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3645         ram_state->migration_dirty_pages++;
3646     }
3647     return block->colo_cache + offset;
3648 }
3649 
3650 /**
3651  * ram_handle_compressed: handle the zero page case
3652  *
3653  * If a page (or a whole RDMA chunk) has been
3654  * determined to be zero, then zap it.
3655  *
3656  * @host: host address for the zero page
3657  * @ch: what the page is filled from.  We only support zero
3658  * @size: size of the zero page
3659  */
3660 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3661 {
3662     if (ch != 0 || !buffer_is_zero(host, size)) {
3663         memset(host, ch, size);
3664     }
3665 }
3666 
3667 /* return the size after decompression, or negative value on error */
3668 static int
3669 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3670                      const uint8_t *source, size_t source_len)
3671 {
3672     int err;
3673 
3674     err = inflateReset(stream);
3675     if (err != Z_OK) {
3676         return -1;
3677     }
3678 
3679     stream->avail_in = source_len;
3680     stream->next_in = (uint8_t *)source;
3681     stream->avail_out = dest_len;
3682     stream->next_out = dest;
3683 
3684     err = inflate(stream, Z_NO_FLUSH);
3685     if (err != Z_STREAM_END) {
3686         return -1;
3687     }
3688 
3689     return stream->total_out;
3690 }
3691 
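     /*
      * Decompression worker thread: waits until a compressed buffer is
      * handed to it, inflates it into the destination page, and then marks
      * itself done so that wait_for_decompress_done() can make progress.
      */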
3692 static void *do_data_decompress(void *opaque)
3693 {
3694     DecompressParam *param = opaque;
3695     unsigned long pagesize;
3696     uint8_t *des;
3697     int len, ret;
3698 
3699     qemu_mutex_lock(&param->mutex);
3700     while (!param->quit) {
3701         if (param->des) {
3702             des = param->des;
3703             len = param->len;
3704             param->des = 0;
3705             qemu_mutex_unlock(&param->mutex);
3706 
3707             pagesize = TARGET_PAGE_SIZE;
3708 
3709             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3710                                        param->compbuf, len);
3711             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3712                 error_report("decompress data failed");
3713                 qemu_file_set_error(decomp_file, ret);
3714             }
3715 
3716             qemu_mutex_lock(&decomp_done_lock);
3717             param->done = true;
3718             qemu_cond_signal(&decomp_done_cond);
3719             qemu_mutex_unlock(&decomp_done_lock);
3720 
3721             qemu_mutex_lock(&param->mutex);
3722         } else {
3723             qemu_cond_wait(&param->cond, &param->mutex);
3724         }
3725     }
3726     qemu_mutex_unlock(&param->mutex);
3727 
3728     return NULL;
3729 }
3730 
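     /*
      * Wait until every decompression thread has finished the page it was
      * given, then return any error recorded on the decompression file.
      */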
3731 static int wait_for_decompress_done(void)
3732 {
3733     int idx, thread_count;
3734 
3735     if (!migrate_compress()) {
3736         return 0;
3737     }
3738 
3739     thread_count = migrate_decompress_threads();
3740     qemu_mutex_lock(&decomp_done_lock);
3741     for (idx = 0; idx < thread_count; idx++) {
3742         while (!decomp_param[idx].done) {
3743             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3744         }
3745     }
3746     qemu_mutex_unlock(&decomp_done_lock);
3747     return qemu_file_get_error(decomp_file);
3748 }
3749 
3750 static void compress_threads_load_cleanup(void)
3751 {
3752     int i, thread_count;
3753 
3754     if (!migrate_compress()) {
3755         return;
3756     }
3757     thread_count = migrate_decompress_threads();
3758     for (i = 0; i < thread_count; i++) {
3759         /*
3760          * We use it as an indicator of whether the thread has been
3761          * properly initialized or not.
3762          */
3763         if (!decomp_param[i].compbuf) {
3764             break;
3765         }
3766 
3767         qemu_mutex_lock(&decomp_param[i].mutex);
3768         decomp_param[i].quit = true;
3769         qemu_cond_signal(&decomp_param[i].cond);
3770         qemu_mutex_unlock(&decomp_param[i].mutex);
3771     }
3772     for (i = 0; i < thread_count; i++) {
3773         if (!decomp_param[i].compbuf) {
3774             break;
3775         }
3776 
3777         qemu_thread_join(decompress_threads + i);
3778         qemu_mutex_destroy(&decomp_param[i].mutex);
3779         qemu_cond_destroy(&decomp_param[i].cond);
3780         inflateEnd(&decomp_param[i].stream);
3781         g_free(decomp_param[i].compbuf);
3782         decomp_param[i].compbuf = NULL;
3783     }
3784     g_free(decompress_threads);
3785     g_free(decomp_param);
3786     decompress_threads = NULL;
3787     decomp_param = NULL;
3788     decomp_file = NULL;
3789 }
3790 
3791 static int compress_threads_load_setup(QEMUFile *f)
3792 {
3793     int i, thread_count;
3794 
3795     if (!migrate_compress()) {
3796         return 0;
3797     }
3798 
3799     thread_count = migrate_decompress_threads();
3800     decompress_threads = g_new0(QemuThread, thread_count);
3801     decomp_param = g_new0(DecompressParam, thread_count);
3802     qemu_mutex_init(&decomp_done_lock);
3803     qemu_cond_init(&decomp_done_cond);
3804     decomp_file = f;
3805     for (i = 0; i < thread_count; i++) {
3806         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3807             goto exit;
3808         }
3809 
3810         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3811         qemu_mutex_init(&decomp_param[i].mutex);
3812         qemu_cond_init(&decomp_param[i].cond);
3813         decomp_param[i].done = true;
3814         decomp_param[i].quit = false;
3815         qemu_thread_create(decompress_threads + i, "decompress",
3816                            do_data_decompress, decomp_param + i,
3817                            QEMU_THREAD_JOINABLE);
3818     }
3819     return 0;
3820 exit:
3821     compress_threads_load_cleanup();
3822     return -1;
3823 }
3824 
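     /*
      * Hand one compressed page to an idle decompression thread, waiting on
      * decomp_done_cond until a thread becomes available.
      */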
3825 static void decompress_data_with_multi_threads(QEMUFile *f,
3826                                                void *host, int len)
3827 {
3828     int idx, thread_count;
3829 
3830     thread_count = migrate_decompress_threads();
3831     QEMU_LOCK_GUARD(&decomp_done_lock);
3832     while (true) {
3833         for (idx = 0; idx < thread_count; idx++) {
3834             if (decomp_param[idx].done) {
3835                 decomp_param[idx].done = false;
3836                 qemu_mutex_lock(&decomp_param[idx].mutex);
3837                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3838                 decomp_param[idx].des = host;
3839                 decomp_param[idx].len = len;
3840                 qemu_cond_signal(&decomp_param[idx].cond);
3841                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3842                 break;
3843             }
3844         }
3845         if (idx < thread_count) {
3846             break;
3847         } else {
3848             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3849         }
3850     }
3851 }
3852 
3853 static void colo_init_ram_state(void)
3854 {
3855     ram_state_init(&ram_state);
3856 }
3857 
3858 /*
3859  * colo cache: this is for the secondary VM, where we cache the whole
3860  * memory of the secondary VM.  The global lock must be held when
3861  * calling this helper.
3862  */
3863 int colo_init_ram_cache(void)
3864 {
3865     RAMBlock *block;
3866 
3867     WITH_RCU_READ_LOCK_GUARD() {
3868         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3869             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3870                                                     NULL, false, false);
3871             if (!block->colo_cache) {
3872                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3873                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3874                              block->used_length);
3875                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3876                     if (block->colo_cache) {
3877                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3878                         block->colo_cache = NULL;
3879                     }
3880                 }
3881                 return -errno;
3882             }
3883             if (!machine_dump_guest_core(current_machine)) {
3884                 qemu_madvise(block->colo_cache, block->used_length,
3885                              QEMU_MADV_DONTDUMP);
3886             }
3887         }
3888     }
3889 
3890     /*
3891     * Record the dirty pages that were sent by the PVM; we use this dirty
3892     * bitmap to decide which pages in the cache should be flushed into the
3893     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3894     */
3895     if (ram_bytes_total()) {
3896         RAMBlock *block;
3897 
3898         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3900             block->bmap = bitmap_new(pages);
3901         }
3902     }
3903 
3904     colo_init_ram_state();
3905     return 0;
3906 }
3907 
3908 /* TODO: duplicated with ram_init_bitmaps */
3909 void colo_incoming_start_dirty_log(void)
3910 {
3911     RAMBlock *block = NULL;
3912     /* For memory_global_dirty_log_start below. */
3913     qemu_mutex_lock_iothread();
3914     qemu_mutex_lock_ramlist();
3915 
3916     memory_global_dirty_log_sync();
3917     WITH_RCU_READ_LOCK_GUARD() {
3918         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3919             ramblock_sync_dirty_bitmap(ram_state, block);
3920             /* Discard this dirty bitmap record */
3921             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3922         }
3923         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3924     }
3925     ram_state->migration_dirty_pages = 0;
3926     qemu_mutex_unlock_ramlist();
3927     qemu_mutex_unlock_iothread();
3928 }
3929 
3930 /* The global lock must be held to call this helper */
3931 void colo_release_ram_cache(void)
3932 {
3933     RAMBlock *block;
3934 
3935     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3936     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3937         g_free(block->bmap);
3938         block->bmap = NULL;
3939     }
3940 
3941     WITH_RCU_READ_LOCK_GUARD() {
3942         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3943             if (block->colo_cache) {
3944                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3945                 block->colo_cache = NULL;
3946             }
3947         }
3948     }
3949     ram_state_cleanup(&ram_state);
3950 }
3951 
3952 /**
3953  * ram_load_setup: Setup RAM for migration incoming side
3954  *
3955  * Returns zero to indicate success and negative for error
3956  *
3957  * @f: QEMUFile where to receive the data
3958  * @opaque: RAMState pointer
3959  */
3960 static int ram_load_setup(QEMUFile *f, void *opaque)
3961 {
3962     if (compress_threads_load_setup(f)) {
3963         return -1;
3964     }
3965 
3966     xbzrle_load_setup();
3967     ramblock_recv_map_init();
3968 
3969     return 0;
3970 }
3971 
3972 static int ram_load_cleanup(void *opaque)
3973 {
3974     RAMBlock *rb;
3975 
3976     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3977         qemu_ram_block_writeback(rb);
3978     }
3979 
3980     xbzrle_load_cleanup();
3981     compress_threads_load_cleanup();
3982 
3983     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3984         g_free(rb->receivedmap);
3985         rb->receivedmap = NULL;
3986     }
3987 
3988     return 0;
3989 }
3990 
3991 /**
3992  * ram_postcopy_incoming_init: allocate postcopy data structures
3993  *
3994  * Returns 0 for success and negative if there was one error
3995  *
3996  * @mis: current migration incoming state
3997  *
3998  * Allocate data structures etc needed by incoming migration with
3999  * postcopy-ram. postcopy-ram's similarly named
4000  * postcopy_ram_incoming_init does the work.
4001  */
4002 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4003 {
4004     return postcopy_ram_incoming_init(mis);
4005 }
4006 
4007 /**
4008  * ram_load_postcopy: load a page in postcopy case
4009  *
4010  * Returns 0 for success or -errno in case of error
4011  *
4012  * Called in postcopy mode by ram_load().
4013  * rcu_read_lock is taken prior to this being called.
4014  *
4015  * @f: QEMUFile where to send the data
4016  * @channel: the channel to use for loading
4017  */
4018 int ram_load_postcopy(QEMUFile *f, int channel)
4019 {
4020     int flags = 0, ret = 0;
4021     bool place_needed = false;
4022     bool matches_target_page_size = false;
4023     MigrationIncomingState *mis = migration_incoming_get_current();
4024     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4025 
4026     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4027         ram_addr_t addr;
4028         void *page_buffer = NULL;
4029         void *place_source = NULL;
4030         RAMBlock *block = NULL;
4031         uint8_t ch;
4032         int len;
4033 
4034         addr = qemu_get_be64(f);
4035 
4036         /*
4037          * If qemu file error, we should stop here, and then "addr"
4038          * may be invalid
4039          */
4040         ret = qemu_file_get_error(f);
4041         if (ret) {
4042             break;
4043         }
4044 
4045         flags = addr & ~TARGET_PAGE_MASK;
4046         addr &= TARGET_PAGE_MASK;
4047 
4048         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4049         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4050                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4051             block = ram_block_from_stream(mis, f, flags, channel);
4052             if (!block) {
4053                 ret = -EINVAL;
4054                 break;
4055             }
4056 
4057             /*
4058              * Relying on used_length is racy and can result in false positives.
4059              * We might place pages beyond used_length in case RAM was shrunk
4060              * while in postcopy, which is fine - trying to place via
4061              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4062              */
4063             if (!block->host || addr >= block->postcopy_length) {
4064                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4065                 ret = -EINVAL;
4066                 break;
4067             }
4068             tmp_page->target_pages++;
4069             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4070             /*
4071              * Postcopy requires that we place whole host pages atomically;
4072              * these may be huge pages for RAMBlocks that are backed by
4073              * hugetlbfs.
4074              * To make it atomic, the data is read into a temporary page
4075              * that's moved into place later.
4076              * The migration protocol uses (possibly smaller) target pages;
4077              * however the source ensures it always sends all the components
4078              * of a host page in one chunk.
4079              */
4080             page_buffer = tmp_page->tmp_huge_page +
4081                           host_page_offset_from_ram_block_offset(block, addr);
4082             /* If all target pages are zero then we can optimise the placement */
4083             if (tmp_page->target_pages == 1) {
4084                 tmp_page->host_addr =
4085                     host_page_from_ram_block_offset(block, addr);
4086             } else if (tmp_page->host_addr !=
4087                        host_page_from_ram_block_offset(block, addr)) {
4088                 /* not the first target page within the host page */
4089                 error_report("Non-same host page detected on channel %d: "
4090                              "Target host page %p, received host page %p "
4091                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4092                              channel, tmp_page->host_addr,
4093                              host_page_from_ram_block_offset(block, addr),
4094                              block->idstr, addr, tmp_page->target_pages);
4095                 ret = -EINVAL;
4096                 break;
4097             }
4098 
4099             /*
4100              * If it's the last part of a host page then we place the host
4101              * page
4102              */
4103             if (tmp_page->target_pages ==
4104                 (block->page_size / TARGET_PAGE_SIZE)) {
4105                 place_needed = true;
4106             }
4107             place_source = tmp_page->tmp_huge_page;
4108         }
4109 
4110         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4111         case RAM_SAVE_FLAG_ZERO:
4112             ch = qemu_get_byte(f);
4113             /*
4114              * We can skip setting page_buffer when
4115              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4116              */
4117             if (ch || !matches_target_page_size) {
4118                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4119             }
4120             if (ch) {
4121                 tmp_page->all_zero = false;
4122             }
4123             break;
4124 
4125         case RAM_SAVE_FLAG_PAGE:
4126             tmp_page->all_zero = false;
4127             if (!matches_target_page_size) {
4128                 /* For huge pages, we always use temporary buffer */
4129                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4130             } else {
4131                 /*
4132                  * For small pages that matches target page size, we
4133                  * avoid the qemu_file copy.  Instead we directly use
4134                  * the buffer of QEMUFile to place the page.  Note: we
4135                  * cannot do any QEMUFile operation before using that
4136                  * buffer to make sure the buffer is valid when
4137                  * placing the page.
4138                  */
4139                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4140                                          TARGET_PAGE_SIZE);
4141             }
4142             break;
4143         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4144             tmp_page->all_zero = false;
4145             len = qemu_get_be32(f);
4146             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4147                 error_report("Invalid compressed data length: %d", len);
4148                 ret = -EINVAL;
4149                 break;
4150             }
4151             decompress_data_with_multi_threads(f, page_buffer, len);
4152             break;
4153 
4154         case RAM_SAVE_FLAG_EOS:
4155             /* normal exit */
4156             multifd_recv_sync_main();
4157             break;
4158         default:
4159             error_report("Unknown combination of migration flags: 0x%x"
4160                          " (postcopy mode)", flags);
4161             ret = -EINVAL;
4162             break;
4163         }
4164 
4165         /* Got the whole host page, wait for decompress before placing. */
4166         if (place_needed) {
4167             ret |= wait_for_decompress_done();
4168         }
4169 
4170         /* Detect for any possible file errors */
4171         if (!ret && qemu_file_get_error(f)) {
4172             ret = qemu_file_get_error(f);
4173         }
4174 
4175         if (!ret && place_needed) {
4176             if (tmp_page->all_zero) {
4177                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4178             } else {
4179                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4180                                           place_source, block);
4181             }
4182             place_needed = false;
4183             postcopy_temp_page_reset(tmp_page);
4184         }
4185     }
4186 
4187     return ret;
4188 }
4189 
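     /*
      * True while the incoming side is between the postcopy listen phase
      * and the end of postcopy, i.e. while pages must be placed atomically.
      */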
4190 static bool postcopy_is_running(void)
4191 {
4192     PostcopyState ps = postcopy_state_get();
4193     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4194 }
4195 
4196 /*
4197  * Flush content of RAM cache into SVM's memory.
4198  * Only flush the pages that be dirtied by PVM or SVM or both.
4199  */
4200 void colo_flush_ram_cache(void)
4201 {
4202     RAMBlock *block = NULL;
4203     void *dst_host;
4204     void *src_host;
4205     unsigned long offset = 0;
4206 
4207     memory_global_dirty_log_sync();
4208     WITH_RCU_READ_LOCK_GUARD() {
4209         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4210             ramblock_sync_dirty_bitmap(ram_state, block);
4211         }
4212     }
4213 
4214     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4215     WITH_RCU_READ_LOCK_GUARD() {
4216         block = QLIST_FIRST_RCU(&ram_list.blocks);
4217 
4218         while (block) {
4219             unsigned long num = 0;
4220 
4221             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4222             if (!offset_in_ramblock(block,
4223                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4224                 offset = 0;
4225                 num = 0;
4226                 block = QLIST_NEXT_RCU(block, next);
4227             } else {
4228                 unsigned long i = 0;
4229 
4230                 for (i = 0; i < num; i++) {
4231                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4232                 }
4233                 dst_host = block->host
4234                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4235                 src_host = block->colo_cache
4236                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4237                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4238                 offset += num;
4239             }
4240         }
4241     }
4242     trace_colo_flush_ram_cache_end();
4243 }
4244 
4245 /**
4246  * ram_load_precopy: load pages in precopy case
4247  *
4248  * Returns 0 for success or -errno in case of error
4249  *
4250  * Called in precopy mode by ram_load().
4251  * rcu_read_lock is taken prior to this being called.
4252  *
4253  * @f: QEMUFile where to send the data
4254  */
4255 static int ram_load_precopy(QEMUFile *f)
4256 {
4257     MigrationIncomingState *mis = migration_incoming_get_current();
4258     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4259     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4260     bool postcopy_advised = migration_incoming_postcopy_advised();
4261     if (!migrate_compress()) {
4262         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4263     }
4264 
4265     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4266         ram_addr_t addr, total_ram_bytes;
4267         void *host = NULL, *host_bak = NULL;
4268         uint8_t ch;
4269 
4270         /*
4271          * Yield periodically to let main loop run, but an iteration of
4272          * the main loop is expensive, so only do it every few iterations.
4273          */
4274         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4275             aio_co_schedule(qemu_get_current_aio_context(),
4276                             qemu_coroutine_self());
4277             qemu_coroutine_yield();
4278         }
4279         i++;
4280 
4281         addr = qemu_get_be64(f);
4282         flags = addr & ~TARGET_PAGE_MASK;
4283         addr &= TARGET_PAGE_MASK;
4284 
4285         if (flags & invalid_flags) {
4286             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4287                 error_report("Received an unexpected compressed page");
4288             }
4289 
4290             ret = -EINVAL;
4291             break;
4292         }
4293 
4294         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4295                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4296             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4297                                                     RAM_CHANNEL_PRECOPY);
4298 
4299             host = host_from_ram_block_offset(block, addr);
4300             /*
4301              * After entering the COLO stage, we should not load pages
4302              * into the SVM's memory directly; we put them into colo_cache first.
4303              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4304              * Previously, we copied all this memory in the COLO preparation
4305              * stage, during which the VM had to be stopped, which was
4306              * time-consuming. Here we optimize it by backing up every page
4307              * during the migration process while COLO is enabled. This slows
4308              * the migration down a bit, but it clearly reduces the downtime
4309              * of backing up all the SVM's memory in the COLO preparation stage.
4310              */
4311             if (migration_incoming_colo_enabled()) {
4312                 if (migration_incoming_in_colo_state()) {
4313                     /* In COLO stage, put all pages into cache temporarily */
4314                     host = colo_cache_from_block_offset(block, addr, true);
4315                 } else {
4316                    /*
4317                     * In migration stage but before COLO stage,
4318                     * Put all pages into both cache and SVM's memory.
4319                     */
4320                     host_bak = colo_cache_from_block_offset(block, addr, false);
4321                 }
4322             }
4323             if (!host) {
4324                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4325                 ret = -EINVAL;
4326                 break;
4327             }
4328             if (!migration_incoming_in_colo_state()) {
4329                 ramblock_recv_bitmap_set(block, host);
4330             }
4331 
4332             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4333         }
4334 
4335         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4336         case RAM_SAVE_FLAG_MEM_SIZE:
4337             /* Synchronize RAM block list */
4338             total_ram_bytes = addr;
4339             while (!ret && total_ram_bytes) {
4340                 RAMBlock *block;
4341                 char id[256];
4342                 ram_addr_t length;
4343 
4344                 len = qemu_get_byte(f);
4345                 qemu_get_buffer(f, (uint8_t *)id, len);
4346                 id[len] = 0;
4347                 length = qemu_get_be64(f);
4348 
4349                 block = qemu_ram_block_by_name(id);
4350                 if (block && !qemu_ram_is_migratable(block)) {
4351                     error_report("block %s should not be migrated !", id);
4352                     ret = -EINVAL;
4353                 } else if (block) {
4354                     if (length != block->used_length) {
4355                         Error *local_err = NULL;
4356 
4357                         ret = qemu_ram_resize(block, length,
4358                                               &local_err);
4359                         if (local_err) {
4360                             error_report_err(local_err);
4361                         }
4362                     }
4363                     /* For postcopy we need to check hugepage sizes match */
4364                     if (postcopy_advised && migrate_postcopy_ram() &&
4365                         block->page_size != qemu_host_page_size) {
4366                         uint64_t remote_page_size = qemu_get_be64(f);
4367                         if (remote_page_size != block->page_size) {
4368                             error_report("Mismatched RAM page size %s "
4369                                          "(local) %zd != %" PRId64,
4370                                          id, block->page_size,
4371                                          remote_page_size);
4372                             ret = -EINVAL;
4373                         }
4374                     }
4375                     if (migrate_ignore_shared()) {
4376                         hwaddr addr = qemu_get_be64(f);
4377                         if (ramblock_is_ignored(block) &&
4378                             block->mr->addr != addr) {
4379                             error_report("Mismatched GPAs for block %s "
4380                                          "%" PRId64 "!= %" PRId64,
4381                                          id, (uint64_t)addr,
4382                                          (uint64_t)block->mr->addr);
4383                             ret = -EINVAL;
4384                         }
4385                     }
4386                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4387                                           block->idstr);
4388                 } else {
4389                     error_report("Unknown ramblock \"%s\", cannot "
4390                                  "accept migration", id);
4391                     ret = -EINVAL;
4392                 }
4393 
4394                 total_ram_bytes -= length;
4395             }
4396             break;
4397 
4398         case RAM_SAVE_FLAG_ZERO:
4399             ch = qemu_get_byte(f);
4400             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4401             break;
4402 
4403         case RAM_SAVE_FLAG_PAGE:
4404             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4405             break;
4406 
4407         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4408             len = qemu_get_be32(f);
4409             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4410                 error_report("Invalid compressed data length: %d", len);
4411                 ret = -EINVAL;
4412                 break;
4413             }
4414             decompress_data_with_multi_threads(f, host, len);
4415             break;
4416 
4417         case RAM_SAVE_FLAG_XBZRLE:
4418             if (load_xbzrle(f, addr, host) < 0) {
4419                 error_report("Failed to decompress XBZRLE page at "
4420                              RAM_ADDR_FMT, addr);
4421                 ret = -EINVAL;
4422                 break;
4423             }
4424             break;
4425         case RAM_SAVE_FLAG_EOS:
4426             /* normal exit */
4427             multifd_recv_sync_main();
4428             break;
4429         default:
4430             if (flags & RAM_SAVE_FLAG_HOOK) {
4431                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4432             } else {
4433                 error_report("Unknown combination of migration flags: 0x%x",
4434                              flags);
4435                 ret = -EINVAL;
4436             }
4437         }
4438         if (!ret) {
4439             ret = qemu_file_get_error(f);
4440         }
4441         if (!ret && host_bak) {
4442             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4443         }
4444     }
4445 
4446     ret |= wait_for_decompress_done();
4447     return ret;
4448 }
4449 
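     /*
      * ram_load: main RAM load entry point, dispatching to the postcopy or
      * precopy loader depending on the current incoming postcopy state.
      */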
4450 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4451 {
4452     int ret = 0;
4453     static uint64_t seq_iter;
4454     /*
4455      * If the system is running in postcopy mode, page inserts into host
4456      * memory must be atomic.
4457      */
4458     bool postcopy_running = postcopy_is_running();
4459 
4460     seq_iter++;
4461 
4462     if (version_id != 4) {
4463         return -EINVAL;
4464     }
4465 
4466     /*
4467      * This RCU critical section can be very long running.
4468      * When RCU reclaims in the code start to become numerous,
4469      * it will be necessary to reduce the granularity of this
4470      * critical section.
4471      */
4472     WITH_RCU_READ_LOCK_GUARD() {
4473         if (postcopy_running) {
4474             /*
4475              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4476              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4477              * service fast page faults.
4478              */
4479             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4480         } else {
4481             ret = ram_load_precopy(f);
4482         }
4483     }
4484     trace_ram_load_complete(ret, seq_iter);
4485 
4486     return ret;
4487 }
4488 
4489 static bool ram_has_postcopy(void *opaque)
4490 {
4491     RAMBlock *rb;
4492     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4493         if (ramblock_is_pmem(rb)) {
4494             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4495                          "is not supported now!", rb->idstr, rb->host);
4496             return false;
4497         }
4498     }
4499 
4500     return migrate_postcopy_ram();
4501 }
4502 
4503 /* Sync all the dirty bitmap with destination VM.  */
4504 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4505 {
4506     RAMBlock *block;
4507     QEMUFile *file = s->to_dst_file;
4508     int ramblock_count = 0;
4509 
4510     trace_ram_dirty_bitmap_sync_start();
4511 
4512     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4513         qemu_savevm_send_recv_bitmap(file, block->idstr);
4514         trace_ram_dirty_bitmap_request(block->idstr);
4515         ramblock_count++;
4516     }
4517 
4518     trace_ram_dirty_bitmap_sync_wait();
4519 
4520     /* Wait until all the ramblocks' dirty bitmaps are synced */
4521     while (ramblock_count--) {
4522         qemu_sem_wait(&s->rp_state.rp_sem);
4523     }
4524 
4525     trace_ram_dirty_bitmap_sync_complete();
4526 
4527     return 0;
4528 }
4529 
4530 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4531 {
4532     qemu_sem_post(&s->rp_state.rp_sem);
4533 }
4534 
4535 /*
4536  * Read the received bitmap, revert it as the initial dirty bitmap.
4537  * This is only used when the postcopy migration is paused but wants
4538  * to resume from a middle point.
4539  */
4540 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4541 {
4542     int ret = -EINVAL;
4543     /* from_dst_file is always valid because we're within rp_thread */
4544     QEMUFile *file = s->rp_state.from_dst_file;
4545     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4546     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4547     uint64_t size, end_mark;
4548 
4549     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4550 
4551     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4552         error_report("%s: incorrect state %s", __func__,
4553                      MigrationStatus_str(s->state));
4554         return -EINVAL;
4555     }
4556 
4557     /*
4558      * Note: see comments in ramblock_recv_bitmap_send() on why we
4559      * need the endianness conversion, and the paddings.
4560      */
4561     local_size = ROUND_UP(local_size, 8);
4562 
4563     /* Add paddings */
4564     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4565 
4566     size = qemu_get_be64(file);
4567 
4568     /* The size of the bitmap should match with our ramblock */
4569     if (size != local_size) {
4570         error_report("%s: ramblock '%s' bitmap size mismatch "
4571                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4572                      block->idstr, size, local_size);
4573         ret = -EINVAL;
4574         goto out;
4575     }
4576 
4577     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4578     end_mark = qemu_get_be64(file);
4579 
4580     ret = qemu_file_get_error(file);
4581     if (ret || size != local_size) {
4582         error_report("%s: read bitmap failed for ramblock '%s': %d"
4583                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4584                      __func__, block->idstr, ret, local_size, size);
4585         ret = -EIO;
4586         goto out;
4587     }
4588 
4589     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4590         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4591                      __func__, block->idstr, end_mark);
4592         ret = -EINVAL;
4593         goto out;
4594     }
4595 
4596     /*
4597      * Endianness conversion. We are during postcopy (though paused).
4598      * The dirty bitmap won't change. We can directly modify it.
4599      */
4600     bitmap_from_le(block->bmap, le_bitmap, nbits);
4601 
4602     /*
4603      * What we received is "received bitmap". Revert it as the initial
4604      * dirty bitmap for this ramblock.
4605      */
4606     bitmap_complement(block->bmap, block->bmap, nbits);
4607 
4608     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4609     ramblock_dirty_bitmap_clear_discarded_pages(block);
4610 
4611     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4612     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4613 
4614     /*
4615      * We succeeded in syncing the bitmap for the current ramblock. If this is
4616      * the last one to sync, we need to notify the main send thread.
4617      */
4618     ram_dirty_bitmap_reload_notify(s);
4619 
4620     ret = 0;
4621 out:
4622     g_free(le_bitmap);
4623     return ret;
4624 }
4625 
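     /*
      * resume_prepare handler: pull the received bitmaps back from the
      * destination and rebuild the RAM state before resuming postcopy.
      */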
4626 static int ram_resume_prepare(MigrationState *s, void *opaque)
4627 {
4628     RAMState *rs = *(RAMState **)opaque;
4629     int ret;
4630 
4631     ret = ram_dirty_bitmap_sync_all(s, rs);
4632     if (ret) {
4633         return ret;
4634     }
4635 
4636     ram_state_resume_prepare(rs, s->to_dst_file);
4637 
4638     return 0;
4639 }
4640 
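     /*
      * Flush an EOS marker to the postcopy preempt channel so the
      * destination sees a clean end of stream on that channel.
      */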
4641 void postcopy_preempt_shutdown_file(MigrationState *s)
4642 {
4643     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4644     qemu_fflush(s->postcopy_qemufile_src);
4645 }
4646 
4647 static SaveVMHandlers savevm_ram_handlers = {
4648     .save_setup = ram_save_setup,
4649     .save_live_iterate = ram_save_iterate,
4650     .save_live_complete_postcopy = ram_save_complete,
4651     .save_live_complete_precopy = ram_save_complete,
4652     .has_postcopy = ram_has_postcopy,
4653     .state_pending_exact = ram_state_pending_exact,
4654     .state_pending_estimate = ram_state_pending_estimate,
4655     .load_state = ram_load,
4656     .save_cleanup = ram_save_cleanup,
4657     .load_setup = ram_load_setup,
4658     .load_cleanup = ram_load_cleanup,
4659     .resume_prepare = ram_resume_prepare,
4660 };
4661 
4662 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4663                                       size_t old_size, size_t new_size)
4664 {
4665     PostcopyState ps = postcopy_state_get();
4666     ram_addr_t offset;
4667     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4668     Error *err = NULL;
4669 
4670     if (ramblock_is_ignored(rb)) {
4671         return;
4672     }
4673 
4674     if (!migration_is_idle()) {
4675         /*
4676          * Precopy code on the source cannot deal with the size of RAM blocks
4677          * changing at random points in time - especially after sending the
4678          * RAM block sizes in the migration stream, they must no longer change.
4679          * Abort and indicate a proper reason.
4680          */
4681         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4682         migration_cancel(err);
4683         error_free(err);
4684     }
4685 
4686     switch (ps) {
4687     case POSTCOPY_INCOMING_ADVISE:
4688         /*
4689          * Update what ram_postcopy_incoming_init()->init_range() does at the
4690          * time postcopy was advised. Syncing RAM blocks with the source will
4691          * result in RAM resizes.
4692          */
4693         if (old_size < new_size) {
4694             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4695                 error_report("RAM block '%s' discard of resized RAM failed",
4696                              rb->idstr);
4697             }
4698         }
4699         rb->postcopy_length = new_size;
4700         break;
4701     case POSTCOPY_INCOMING_NONE:
4702     case POSTCOPY_INCOMING_RUNNING:
4703     case POSTCOPY_INCOMING_END:
4704         /*
4705          * Once our guest is running, postcopy no longer cares about
4706          * resizes. When growing, the new memory was not available on the
4707          * source, so no handler is needed.
4708          */
4709         break;
4710     default:
4711         error_report("RAM block '%s' resized during postcopy state: %d",
4712                      rb->idstr, ps);
4713         exit(-1);
4714     }
4715 }
4716 
4717 static RAMBlockNotifier ram_mig_ram_notifier = {
4718     .ram_block_resized = ram_mig_ram_block_resized,
4719 };
4720 
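     /*
      * Register the "ram" savevm handlers and the RAM block resize
      * notifier.
      */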
4721 void ram_mig_init(void)
4722 {
4723     qemu_mutex_init(&XBZRLE.lock);
4724     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4725     ram_block_notifier_add(&ram_mig_ram_notifier);
4726 }
4727