xref: /openbmc/qemu/migration/ram.c (revision 081340d1ddfe27e80f653d707c278edf06a9a803)
1  /*
2   * QEMU System Emulator
3   *
4   * Copyright (c) 2003-2008 Fabrice Bellard
5   * Copyright (c) 2011-2015 Red Hat Inc
6   *
7   * Authors:
8   *  Juan Quintela <quintela@redhat.com>
9   *
10   * Permission is hereby granted, free of charge, to any person obtaining a copy
11   * of this software and associated documentation files (the "Software"), to deal
12   * in the Software without restriction, including without limitation the rights
13   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14   * copies of the Software, and to permit persons to whom the Software is
15   * furnished to do so, subject to the following conditions:
16   *
17   * The above copyright notice and this permission notice shall be included in
18   * all copies or substantial portions of the Software.
19   *
20   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26   * THE SOFTWARE.
27   */
28  
29  #include "qemu/osdep.h"
30  #include "qemu/cutils.h"
31  #include "qemu/bitops.h"
32  #include "qemu/bitmap.h"
33  #include "qemu/madvise.h"
34  #include "qemu/main-loop.h"
35  #include "xbzrle.h"
36  #include "ram-compress.h"
37  #include "ram.h"
38  #include "migration.h"
39  #include "migration-stats.h"
40  #include "migration/register.h"
41  #include "migration/misc.h"
42  #include "qemu-file.h"
43  #include "postcopy-ram.h"
44  #include "page_cache.h"
45  #include "qemu/error-report.h"
46  #include "qapi/error.h"
47  #include "qapi/qapi-types-migration.h"
48  #include "qapi/qapi-events-migration.h"
49  #include "qapi/qapi-commands-migration.h"
50  #include "qapi/qmp/qerror.h"
51  #include "trace.h"
52  #include "exec/ram_addr.h"
53  #include "exec/target_page.h"
54  #include "qemu/rcu_queue.h"
55  #include "migration/colo.h"
56  #include "block.h"
57  #include "sysemu/cpu-throttle.h"
58  #include "savevm.h"
59  #include "qemu/iov.h"
60  #include "multifd.h"
61  #include "sysemu/runstate.h"
62  #include "rdma.h"
63  #include "options.h"
64  #include "sysemu/dirtylimit.h"
65  #include "sysemu/kvm.h"
66  
67  #include "hw/boards.h" /* for machine_dump_guest_core() */
68  
69  #if defined(__linux__)
70  #include "qemu/userfaultfd.h"
71  #endif /* defined(__linux__) */
72  
73  /***********************************************************/
74  /* ram save/restore */
75  
76  /*
77   * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
78   * worked for pages that were filled with the same char.  We switched
79   * it to only search for the zero value, and renamed it to avoid
80   * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
81   */
82  /*
83   * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
84   */
85  #define RAM_SAVE_FLAG_FULL     0x01
86  #define RAM_SAVE_FLAG_ZERO     0x02
87  #define RAM_SAVE_FLAG_MEM_SIZE 0x04
88  #define RAM_SAVE_FLAG_PAGE     0x08
89  #define RAM_SAVE_FLAG_EOS      0x10
90  #define RAM_SAVE_FLAG_CONTINUE 0x20
91  #define RAM_SAVE_FLAG_XBZRLE   0x40
92  /* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */
93  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
94  #define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
95  /* We can't use any flag that is bigger than 0x200 */
96  
97  /*
98   * mapped-ram migration supports O_DIRECT, so we need to make sure the
99   * userspace buffer, the IO operation size and the file offset are
100   * aligned according to the underlying device's block size. The first
101   * two are already aligned to page size, but we need to add padding to
102   * the file to align the offset.  We cannot read the block size
103   * dynamically because the migration file can be moved between
104   * different systems, so use 1M to cover most block sizes and to keep
105   * the file offset aligned at page size as well.
106   */
107  #define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000
108  
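/*
 * Illustrative sketch, not part of the original file: how a file offset
 * could be padded up to MAPPED_RAM_FILE_OFFSET_ALIGNMENT so the pages
 * region stays suitable for O_DIRECT I/O.  The helper name is made up and
 * the block is kept under #if 0 because it is only an example.
 */
#if 0
static uint64_t mapped_ram_pad_offset_example(uint64_t offset)
{
    /* e.g. 0x180000 is rounded up to 0x200000 (2 * 1 MiB) */
    return ROUND_UP(offset, MAPPED_RAM_FILE_OFFSET_ALIGNMENT);
}
#endif
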
109  /*
110   * When doing mapped-ram migration, this is the amount we read from
111   * the pages region in the migration file at a time.
112   */
113  #define MAPPED_RAM_LOAD_BUF_SIZE 0x100000
114  
115  XBZRLECacheStats xbzrle_counters;
116  
117  /* used by the search for pages to send */
118  struct PageSearchStatus {
119      /* The migration channel used for a specific host page */
120      QEMUFile    *pss_channel;
121      /* Last block from where we have sent data */
122      RAMBlock *last_sent_block;
123      /* Current block being searched */
124      RAMBlock    *block;
125      /* Current page to search from */
126      unsigned long page;
127      /* Set once we wrap around */
128      bool         complete_round;
129      /* Whether we're sending a host page */
130      bool          host_page_sending;
131      /* The start/end of current host page.  Invalid if host_page_sending==false */
132      unsigned long host_page_start;
133      unsigned long host_page_end;
134  };
135  typedef struct PageSearchStatus PageSearchStatus;
136  
137  /* struct contains XBZRLE cache and a static page
138     used by the compression */
139  static struct {
140      /* buffer used for XBZRLE encoding */
141      uint8_t *encoded_buf;
142      /* buffer for storing page content */
143      uint8_t *current_buf;
144      /* Cache for XBZRLE, Protected by lock. */
145      PageCache *cache;
146      QemuMutex lock;
147      /* it will store a page full of zeros */
148      uint8_t *zero_target_page;
149      /* buffer used for XBZRLE decoding */
150      uint8_t *decoded_buf;
151  } XBZRLE;
152  
153  static void XBZRLE_cache_lock(void)
154  {
155      if (migrate_xbzrle()) {
156          qemu_mutex_lock(&XBZRLE.lock);
157      }
158  }
159  
160  static void XBZRLE_cache_unlock(void)
161  {
162      if (migrate_xbzrle()) {
163          qemu_mutex_unlock(&XBZRLE.lock);
164      }
165  }
166  
167  /**
168   * xbzrle_cache_resize: resize the xbzrle cache
169   *
170   * This function is called from migrate_params_apply in the main
171   * thread, possibly while a migration is in progress.  A running
172   * migration may be using the cache and might finish during this call,
173   * hence changes to the cache are protected by XBZRLE.lock.
174   *
175   * Returns 0 for success or -1 for error
176   *
177   * @new_size: new cache size
178   * @errp: set to the failure reason if the check fails
179   */
180  int xbzrle_cache_resize(uint64_t new_size, Error **errp)
181  {
182      PageCache *new_cache;
183      int64_t ret = 0;
184  
185      /* Check for truncation */
186      if (new_size != (size_t)new_size) {
187          error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
188                     "exceeding address space");
189          return -1;
190      }
191  
192      if (new_size == migrate_xbzrle_cache_size()) {
193          /* nothing to do */
194          return 0;
195      }
196  
197      XBZRLE_cache_lock();
198  
199      if (XBZRLE.cache != NULL) {
200          new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
201          if (!new_cache) {
202              ret = -1;
203              goto out;
204          }
205  
206          cache_fini(XBZRLE.cache);
207          XBZRLE.cache = new_cache;
208      }
209  out:
210      XBZRLE_cache_unlock();
211      return ret;
212  }
213  
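/*
 * Illustrative usage sketch, not part of the original file: resizing the
 * cache from a hypothetical caller and reporting the error.  The helper
 * name is made up; kept under #if 0 because it is only an example.
 */
#if 0
static void xbzrle_cache_resize_example(void)
{
    Error *err = NULL;

    /* The new size is given in bytes */
    if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
        error_report_err(err);
    }
}
#endif
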
214  static bool postcopy_preempt_active(void)
215  {
216      return migrate_postcopy_preempt() && migration_in_postcopy();
217  }
218  
219  bool migrate_ram_is_ignored(RAMBlock *block)
220  {
221      return !qemu_ram_is_migratable(block) ||
222             (migrate_ignore_shared() && qemu_ram_is_shared(block)
223                                      && qemu_ram_is_named_file(block));
224  }
225  
226  #undef RAMBLOCK_FOREACH
227  
228  int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
229  {
230      RAMBlock *block;
231      int ret = 0;
232  
233      RCU_READ_LOCK_GUARD();
234  
235      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
236          ret = func(block, opaque);
237          if (ret) {
238              break;
239          }
240      }
241      return ret;
242  }
243  
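/*
 * Illustrative usage sketch, not part of the original file: summing
 * used_length over all migratable blocks with the iterator above.  The
 * helper names are made up; kept under #if 0 because it is only an example.
 */
#if 0
static int add_block_bytes_example(RAMBlock *rb, void *opaque)
{
    uint64_t *total = opaque;

    *total += rb->used_length;
    return 0;   /* returning non-zero would stop the iteration */
}

static uint64_t total_migratable_bytes_example(void)
{
    uint64_t total = 0;

    foreach_not_ignored_block(add_block_bytes_example, &total);
    return total;
}
#endif
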
244  static void ramblock_recv_map_init(void)
245  {
246      RAMBlock *rb;
247  
248      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
249          assert(!rb->receivedmap);
250          rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
251      }
252  }
253  
254  int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
255  {
256      return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
257                      rb->receivedmap);
258  }
259  
260  bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
261  {
262      return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
263  }
264  
265  void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
266  {
267      set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
268  }
269  
270  void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
271                                      size_t nr)
272  {
273      bitmap_set_atomic(rb->receivedmap,
274                        ramblock_recv_bitmap_offset(host_addr, rb),
275                        nr);
276  }
277  
278  #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
279  
280  /*
281   * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
282   *
283   * Returns the number of bytes sent (>0) on success, or <0 on error.
284   */
285  int64_t ramblock_recv_bitmap_send(QEMUFile *file,
286                                    const char *block_name)
287  {
288      RAMBlock *block = qemu_ram_block_by_name(block_name);
289      unsigned long *le_bitmap, nbits;
290      uint64_t size;
291  
292      if (!block) {
293          error_report("%s: invalid block name: %s", __func__, block_name);
294          return -1;
295      }
296  
297      nbits = block->postcopy_length >> TARGET_PAGE_BITS;
298  
299      /*
300       * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
301       * machines we may need 4 more bytes for padding (see the comment
302       * below).  So extend it a bit beforehand.
303       */
304      le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
305  
306      /*
307       * Always use little endian when sending the bitmap.  This is
308       * required so that the bitmap works even when source and destination
309       * VMs are not using the same endianness. (Note: big endian won't work.)
310       */
311      bitmap_to_le(le_bitmap, block->receivedmap, nbits);
312  
313      /* Size of the bitmap, in bytes */
314      size = DIV_ROUND_UP(nbits, 8);
315  
316      /*
317       * size is always aligned to 8 bytes on 64bit machines, but that
318       * may not be the case on 32bit machines. We need this padding to
319       * make sure the migration can survive even between 32bit and
320       * 64bit machines.
321       */
322      size = ROUND_UP(size, 8);
323  
324      qemu_put_be64(file, size);
325      qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
326      g_free(le_bitmap);
327      /*
328       * Mark as an end, in case the middle part is screwed up due to
329       * some "mysterious" reason.
330       */
331      qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
332      int ret = qemu_fflush(file);
333      if (ret) {
334          return ret;
335      }
336  
337      return size + sizeof(size);
338  }
339  
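/*
 * Illustrative sketch, not part of the original file: how the receiving
 * side could parse the stream produced above -- an 8-byte size, the
 * little-endian bitmap padded to 8 bytes, then the ending marker.  The
 * helper name is made up; kept under #if 0 because it is only an example.
 */
#if 0
static int recv_bitmap_parse_example(QEMUFile *file, unsigned long *le_bitmap,
                                     uint64_t expected_size)
{
    uint64_t size = qemu_get_be64(file);

    if (size != expected_size) {
        return -EINVAL;
    }
    qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -EINVAL;
    }
    return 0;
}
#endif
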
340  /*
341   * An outstanding page request, on the source, having been received
342   * and queued
343   */
344  struct RAMSrcPageRequest {
345      RAMBlock *rb;
346      hwaddr    offset;
347      hwaddr    len;
348  
349      QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
350  };
351  
352  /* State of RAM for migration */
353  struct RAMState {
354      /*
355       * PageSearchStatus structures for the channels when sending pages.
356       * Protected by the bitmap_mutex.
357       */
358      PageSearchStatus pss[RAM_CHANNEL_MAX];
359      /* UFFD file descriptor, used in 'write-tracking' migration */
360      int uffdio_fd;
361      /* total ram size in bytes */
362      uint64_t ram_bytes_total;
363      /* Last block that we have visited searching for dirty pages */
364      RAMBlock *last_seen_block;
365      /* Last dirty target page we have sent */
366      ram_addr_t last_page;
367      /* last ram version we have seen */
368      uint32_t last_version;
369      /* How many times we have seen too many dirty pages */
370      int dirty_rate_high_cnt;
371      /* these variables are used for bitmap sync */
372      /* last time we did a full bitmap_sync */
373      int64_t time_last_bitmap_sync;
374      /* bytes transferred at start_time */
375      uint64_t bytes_xfer_prev;
376      /* number of dirty pages since start_time */
377      uint64_t num_dirty_pages_period;
378      /* xbzrle misses since the beginning of the period */
379      uint64_t xbzrle_cache_miss_prev;
380      /* Amount of xbzrle pages since the beginning of the period */
381      uint64_t xbzrle_pages_prev;
382      /* Amount of xbzrle encoded bytes since the beginning of the period */
383      uint64_t xbzrle_bytes_prev;
384      /* Are we really using XBZRLE (e.g., after the first round). */
385      bool xbzrle_started;
386      /* Are we on the last stage of migration */
387      bool last_stage;
388  
389      /* total handled target pages at the beginning of period */
390      uint64_t target_page_count_prev;
391      /* total handled target pages since start */
392      uint64_t target_page_count;
393      /* number of dirty bits in the bitmap */
394      uint64_t migration_dirty_pages;
395      /*
396       * Protects:
397       * - dirty/clear bitmap
398       * - migration_dirty_pages
399       * - pss structures
400       */
401      QemuMutex bitmap_mutex;
402      /* The RAMBlock used in the last src_page_requests */
403      RAMBlock *last_req_rb;
404      /* Queue of outstanding page requests from the destination */
405      QemuMutex src_page_req_mutex;
406      QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
407  
408      /*
409       * This is only used when postcopy is in recovery phase, to communicate
410       * between the migration thread and the return path thread on dirty
411       * bitmap synchronizations.  This field is unused in other stages of
412       * RAM migration.
413       */
414      unsigned int postcopy_bmap_sync_requested;
415  };
416  typedef struct RAMState RAMState;
417  
418  static RAMState *ram_state;
419  
420  static NotifierWithReturnList precopy_notifier_list;
421  
422  /* Whether postcopy has queued requests */
423  static bool postcopy_has_request(RAMState *rs)
424  {
425      return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
426  }
427  
428  void precopy_infrastructure_init(void)
429  {
430      notifier_with_return_list_init(&precopy_notifier_list);
431  }
432  
433  void precopy_add_notifier(NotifierWithReturn *n)
434  {
435      notifier_with_return_list_add(&precopy_notifier_list, n);
436  }
437  
438  void precopy_remove_notifier(NotifierWithReturn *n)
439  {
440      notifier_with_return_remove(n);
441  }
442  
443  int precopy_notify(PrecopyNotifyReason reason, Error **errp)
444  {
445      PrecopyNotifyData pnd;
446      pnd.reason = reason;
447  
448      return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
449  }
450  
451  uint64_t ram_bytes_remaining(void)
452  {
453      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
454                         0;
455  }
456  
457  void ram_transferred_add(uint64_t bytes)
458  {
459      if (runstate_is_running()) {
460          stat64_add(&mig_stats.precopy_bytes, bytes);
461      } else if (migration_in_postcopy()) {
462          stat64_add(&mig_stats.postcopy_bytes, bytes);
463      } else {
464          stat64_add(&mig_stats.downtime_bytes, bytes);
465      }
466  }
467  
468  struct MigrationOps {
469      int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
470  };
471  typedef struct MigrationOps MigrationOps;
472  
473  MigrationOps *migration_ops;
474  
475  static int ram_save_host_page_urgent(PageSearchStatus *pss);
476  
477  /* NOTE: page is the PFN, not a real ram_addr_t. */
478  static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
479  {
480      pss->block = rb;
481      pss->page = page;
482      pss->complete_round = false;
483  }
484  
485  /*
486   * Check whether two PSSs are actively sending the same page.  Return true
487   * if it is, false otherwise.
488   */
489  static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
490  {
491      return pss1->host_page_sending && pss2->host_page_sending &&
492          (pss1->host_page_start == pss2->host_page_start);
493  }
494  
495  /**
496   * save_page_header: write page header to wire
497   *
498   * If this is the 1st block, it also writes the block identification
499   *
500   * Returns the number of bytes written
501   *
502   * @pss: current PSS channel status
503   * @block: block that contains the page we want to send
504   * @offset: offset inside the block for the page
505   *          in the lower bits, it contains flags
506   */
507  static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
508                                 RAMBlock *block, ram_addr_t offset)
509  {
510      size_t size, len;
511      bool same_block = (block == pss->last_sent_block);
512  
513      if (same_block) {
514          offset |= RAM_SAVE_FLAG_CONTINUE;
515      }
516      qemu_put_be64(f, offset);
517      size = 8;
518  
519      if (!same_block) {
520          len = strlen(block->idstr);
521          qemu_put_byte(f, len);
522          qemu_put_buffer(f, (uint8_t *)block->idstr, len);
523          size += 1 + len;
524          pss->last_sent_block = block;
525      }
526      return size;
527  }
528  
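/*
 * Illustrative sketch, not part of the original file: the matching decode
 * of the header written above.  The low bits of the 8-byte value carry the
 * RAM_SAVE_FLAG_* flags; a block id string follows only when
 * RAM_SAVE_FLAG_CONTINUE is not set.  The helper name is made up; kept
 * under #if 0 because it is only an example.
 */
#if 0
static void load_page_header_example(QEMUFile *f, ram_addr_t *offset,
                                     int *flags, char *idstr, size_t idstr_size)
{
    uint64_t addr = qemu_get_be64(f);

    *flags = addr & ~TARGET_PAGE_MASK;
    *offset = addr & TARGET_PAGE_MASK;

    if (!(*flags & RAM_SAVE_FLAG_CONTINUE)) {
        size_t len = qemu_get_byte(f);

        assert(len < idstr_size);
        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
}
#endif
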
529  /**
530   * mig_throttle_guest_down: throttle down the guest
531   *
532   * Reduce amount of guest cpu execution to hopefully slow down memory
533   * writes. If guest dirty memory rate is reduced below the rate at
534   * which we can transfer pages to the destination then we should be
535   * able to complete migration. Some workloads dirty memory way too
536   * fast and will not effectively converge, even with auto-converge.
537   */
538  static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
539                                      uint64_t bytes_dirty_threshold)
540  {
541      uint64_t pct_initial = migrate_cpu_throttle_initial();
542      uint64_t pct_increment = migrate_cpu_throttle_increment();
543      bool pct_tailslow = migrate_cpu_throttle_tailslow();
544      int pct_max = migrate_max_cpu_throttle();
545  
546      uint64_t throttle_now = cpu_throttle_get_percentage();
547      uint64_t cpu_now, cpu_ideal, throttle_inc;
548  
549      /* We have not started throttling yet. Let's start it. */
550      if (!cpu_throttle_active()) {
551          cpu_throttle_set(pct_initial);
552      } else {
553          /* Throttling already on, just increase the rate */
554          if (!pct_tailslow) {
555              throttle_inc = pct_increment;
556          } else {
557              /* Compute the ideal CPU percentage used by the guest, which
558               * may make the dirty rate match the dirty rate threshold. */
559              cpu_now = 100 - throttle_now;
560              cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
561                          bytes_dirty_period);
562              throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
563          }
564          cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
565      }
566  }
567  
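/*
 * Worked example with illustrative numbers: with throttle_now = 20% the
 * guest keeps cpu_now = 80%.  If the period dirtied twice as many bytes as
 * were transferred (bytes_dirty_threshold / bytes_dirty_period = 0.5), the
 * ideal guest share is cpu_ideal = 80 * 0.5 = 40%.  With pct_increment = 10
 * the tailslow path picks throttle_inc = MIN(80 - 40, 10) = 10, so the new
 * throttle becomes MIN(20 + 10, pct_max) = 30%.
 */
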
568  void mig_throttle_counter_reset(void)
569  {
570      RAMState *rs = ram_state;
571  
572      rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
573      rs->num_dirty_pages_period = 0;
574      rs->bytes_xfer_prev = migration_transferred_bytes();
575  }
576  
577  /**
578   * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
579   *
580   * @current_addr: address for the zero page
581   *
582   * Update the xbzrle cache to reflect a page that's been sent as all 0.
583   * The important thing is that a stale (not-yet-0'd) page be replaced
584   * by the new data.
585   * As a bonus, if the page wasn't in the cache it gets added so that
586   * when a small write is made into the 0'd page it gets XBZRLE sent.
587   */
588  static void xbzrle_cache_zero_page(ram_addr_t current_addr)
589  {
590      /* We don't care if this fails to allocate a new cache page
591       * as long as it updated an old one */
592      cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
593                   stat64_get(&mig_stats.dirty_sync_count));
594  }
595  
596  #define ENCODING_FLAG_XBZRLE 0x1
597  
598  /**
599   * save_xbzrle_page: compress and send current page
600   *
601   * Returns: 1 means that we wrote the page
602   *          0 means that page is identical to the one already sent
603   *          -1 means that xbzrle would be longer than normal
604   *
605   * @rs: current RAM state
606   * @pss: current PSS channel
607   * @current_data: pointer to the address of the page contents
608   * @current_addr: addr of the page
609   * @block: block that contains the page we want to send
610   * @offset: offset inside the block for the page
611   */
612  static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
613                              uint8_t **current_data, ram_addr_t current_addr,
614                              RAMBlock *block, ram_addr_t offset)
615  {
616      int encoded_len = 0, bytes_xbzrle;
617      uint8_t *prev_cached_page;
618      QEMUFile *file = pss->pss_channel;
619      uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
620  
621      if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
622          xbzrle_counters.cache_miss++;
623          if (!rs->last_stage) {
624              if (cache_insert(XBZRLE.cache, current_addr, *current_data,
625                               generation) == -1) {
626                  return -1;
627              } else {
628                  /* update *current_data when the page has been
629                     inserted into cache */
630                  *current_data = get_cached_data(XBZRLE.cache, current_addr);
631              }
632          }
633          return -1;
634      }
635  
636      /*
637       * Reaching here means the page has hit the xbzrle cache, no matter what
638       * encoding result it is (normal encoding, overflow or skipping the page),
639       * count the page as encoded. This is used to calculate the encoding rate.
640       *
641       * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
642       * 2nd page turns out to be skipped (i.e. no new bytes written to the
643       * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
644       * skipped page included. In this way, the encoding rate can tell if the
645       * guest page is good for xbzrle encoding.
646       */
647      xbzrle_counters.pages++;
648      prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
649  
650      /* save current buffer into memory */
651      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
652  
653      /* XBZRLE encoding (if there is no overflow) */
654      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
655                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
656                                         TARGET_PAGE_SIZE);
657  
658      /*
659       * Update the cache contents, so that it corresponds to the data
660       * sent, in all cases except where we skip the page.
661       */
662      if (!rs->last_stage && encoded_len != 0) {
663          memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
664          /*
665           * In the case where we couldn't compress, ensure that the caller
666           * sends the data from the cache, since the guest might have
667           * changed the RAM since we copied it.
668           */
669          *current_data = prev_cached_page;
670      }
671  
672      if (encoded_len == 0) {
673          trace_save_xbzrle_page_skipping();
674          return 0;
675      } else if (encoded_len == -1) {
676          trace_save_xbzrle_page_overflow();
677          xbzrle_counters.overflow++;
678          xbzrle_counters.bytes += TARGET_PAGE_SIZE;
679          return -1;
680      }
681  
682      /* Send XBZRLE based compressed page */
683      bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
684                                      offset | RAM_SAVE_FLAG_XBZRLE);
685      qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
686      qemu_put_be16(file, encoded_len);
687      qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
688      bytes_xbzrle += encoded_len + 1 + 2;
689      /*
690       * Like compressed_size (please see update_compress_thread_counts),
691       * the xbzrle encoded bytes don't count the 8 byte header with
692       * RAM_SAVE_FLAG_CONTINUE.
693       */
694      xbzrle_counters.bytes += bytes_xbzrle - 8;
695      ram_transferred_add(bytes_xbzrle);
696  
697      return 1;
698  }
699  
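/*
 * Illustrative sketch, not part of the original file: the matching load
 * side of the record written above -- a one-byte encoding flag, a two-byte
 * encoded length, then the encoded data, which is applied on top of the
 * page contents the destination already has.  The helper name is made up;
 * kept under #if 0 because it is only an example.
 */
#if 0
static int load_xbzrle_page_example(QEMUFile *f, uint8_t *host_page)
{
    g_autofree uint8_t *buf = g_malloc(TARGET_PAGE_SIZE);
    int len;

    if (qemu_get_byte(f) != ENCODING_FLAG_XBZRLE) {
        return -1;
    }
    len = qemu_get_be16(f);
    qemu_get_buffer(f, buf, len);
    /* Decode in place on top of what the destination already has */
    return xbzrle_decode_buffer(buf, len, host_page, TARGET_PAGE_SIZE) < 0 ?
           -1 : 0;
}
#endif
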
700  /**
701   * pss_find_next_dirty: find the next dirty page of current ramblock
702   *
703   * This function updates pss->page to point to the next dirty page index
704   * within the ramblock to migrate, or the end of the ramblock when nothing
705   * is found.  Note that when pss->host_page_sending==true we are in the
706   * middle of sending a host page, so we won't look for dirty pages
707   * outside the host page boundary.
708   *
709   * @pss: the current page search status
710   */
711  static void pss_find_next_dirty(PageSearchStatus *pss)
712  {
713      RAMBlock *rb = pss->block;
714      unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
715      unsigned long *bitmap = rb->bmap;
716  
717      if (migrate_ram_is_ignored(rb)) {
718          /* Points directly to the end, so we know no dirty page */
719          pss->page = size;
720          return;
721      }
722  
723      /*
724       * While sending a host page, only look for dirty pages within the
725       * current host page being sent.
726       */
727      if (pss->host_page_sending) {
728          assert(pss->host_page_end);
729          size = MIN(size, pss->host_page_end);
730      }
731  
732      pss->page = find_next_bit(bitmap, size, pss->page);
733  }
734  
735  static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
736                                                         unsigned long page)
737  {
738      uint8_t shift;
739      hwaddr size, start;
740  
741      if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
742          return;
743      }
744  
745      shift = rb->clear_bmap_shift;
746      /*
747       * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It can
748       * make things easier sometimes since the start address of the
749       * small chunk will always be aligned to 64 pages, so the bitmap
750       * will always be aligned to unsigned long.  We should even be
751       * able to remove this restriction, but I'm simply
752       * keeping it.
753       */
754      assert(shift >= 6);
755  
756      size = 1ULL << (TARGET_PAGE_BITS + shift);
757      start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
758      trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
759      memory_region_clear_dirty_bitmap(rb->mr, start, size);
760  }
761  
762  static void
763  migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
764                                                   unsigned long start,
765                                                   unsigned long npages)
766  {
767      unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
768      unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
769      unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
770  
771      /*
772       * Clear pages from start to start + npages - 1, so the end boundary is
773       * exclusive.
774       */
775      for (i = chunk_start; i < chunk_end; i += chunk_pages) {
776          migration_clear_memory_region_dirty_bitmap(rb, i);
777      }
778  }
779  
780  /*
781   * colo_bitmap_find_dirty: find contiguous dirty pages from start
782   *
783   * Returns the page offset within the memory region of the start of the
784   * contiguous dirty pages
785   *
786   * @rs: current RAM state
787   * @rb: RAMBlock where to search for dirty pages
788   * @start: page where we start the search
789   * @num: the number of contiguous dirty pages
790   */
791  static inline
792  unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
793                                       unsigned long start, unsigned long *num)
794  {
795      unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796      unsigned long *bitmap = rb->bmap;
797      unsigned long first, next;
798  
799      *num = 0;
800  
801      if (migrate_ram_is_ignored(rb)) {
802          return size;
803      }
804  
805      first = find_next_bit(bitmap, size, start);
806      if (first >= size) {
807          return first;
808      }
809      next = find_next_zero_bit(bitmap, size, first + 1);
810      assert(next >= first);
811      *num = next - first;
812      return first;
813  }
814  
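/*
 * Illustrative usage sketch, not part of the original file: walking a
 * RAMBlock as runs of contiguous dirty pages with the helper above.  The
 * helper name is made up; kept under #if 0 because it is only an example.
 */
#if 0
static void colo_walk_dirty_runs_example(RAMState *rs, RAMBlock *rb)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long start = 0, num = 0;

    while (start < size) {
        unsigned long first = colo_bitmap_find_dirty(rs, rb, start, &num);

        if (first >= size) {
            break;
        }
        /* pages [first, first + num) are dirty */
        start = first + num;
    }
}
#endif
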
815  static inline bool migration_bitmap_clear_dirty(RAMState *rs,
816                                                  RAMBlock *rb,
817                                                  unsigned long page)
818  {
819      bool ret;
820  
821      /*
822       * Clear the dirty bitmap if needed.  This _must_ be called before we
823       * send any of the pages in the chunk, because we need to make sure we
824       * can capture further page content changes when we sync the dirty log
825       * the next time.  So as long as we are going to send any of the pages
826       * in the chunk, we clear the remote dirty bitmap for all of them.
827       * Clearing it earlier won't be a problem, but clearing it too late will.
828       */
829      migration_clear_memory_region_dirty_bitmap(rb, page);
830  
831      ret = test_and_clear_bit(page, rb->bmap);
832      if (ret) {
833          rs->migration_dirty_pages--;
834      }
835  
836      return ret;
837  }
838  
839  static void dirty_bitmap_clear_section(MemoryRegionSection *section,
840                                         void *opaque)
841  {
842      const hwaddr offset = section->offset_within_region;
843      const hwaddr size = int128_get64(section->size);
844      const unsigned long start = offset >> TARGET_PAGE_BITS;
845      const unsigned long npages = size >> TARGET_PAGE_BITS;
846      RAMBlock *rb = section->mr->ram_block;
847      uint64_t *cleared_bits = opaque;
848  
849      /*
850       * We don't grab ram_state->bitmap_mutex because we expect to run
851       * only when starting migration or during postcopy recovery where
852       * we don't have concurrent access.
853       */
854      if (!migration_in_postcopy() && !migrate_background_snapshot()) {
855          migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
856      }
857      *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
858      bitmap_clear(rb->bmap, start, npages);
859  }
860  
861  /*
862   * Exclude all dirty pages from migration that fall into a discarded range as
863   * managed by a RamDiscardManager responsible for the mapped memory region of
864   * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
865   *
866   * Discarded pages ("logically unplugged") have undefined content and must
867   * not get migrated, because even reading these pages for migration might
868   * result in undesired behavior.
869   *
870   * Returns the number of cleared bits in the RAMBlock dirty bitmap.
871   *
872   * Note: The result is only stable while migrating (precopy/postcopy).
873   */
874  static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
875  {
876      uint64_t cleared_bits = 0;
877  
878      if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
879          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
880          MemoryRegionSection section = {
881              .mr = rb->mr,
882              .offset_within_region = 0,
883              .size = int128_make64(qemu_ram_get_used_length(rb)),
884          };
885  
886          ram_discard_manager_replay_discarded(rdm, &section,
887                                               dirty_bitmap_clear_section,
888                                               &cleared_bits);
889      }
890      return cleared_bits;
891  }
892  
893  /*
894   * Check if a host-page aligned page falls into a discarded range as managed by
895   * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
896   *
897   * Note: The result is only stable while migrating (precopy/postcopy).
898   */
899  bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
900  {
901      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
902          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
903          MemoryRegionSection section = {
904              .mr = rb->mr,
905              .offset_within_region = start,
906              .size = int128_make64(qemu_ram_pagesize(rb)),
907          };
908  
909          return !ram_discard_manager_is_populated(rdm, &section);
910      }
911      return false;
912  }
913  
914  /* Called with RCU critical section */
915  static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
916  {
917      uint64_t new_dirty_pages =
918          cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
919  
920      rs->migration_dirty_pages += new_dirty_pages;
921      rs->num_dirty_pages_period += new_dirty_pages;
922  }
923  
924  /**
925   * ram_pagesize_summary: calculate all the pagesizes of a VM
926   *
927   * Returns a summary bitmap of the page sizes of all RAMBlocks
928   *
929   * For VMs with just normal pages this is equivalent to the host page
930   * size. If it has some huge pages then it's the OR of all the
931   * different page sizes.
932   */
933  uint64_t ram_pagesize_summary(void)
934  {
935      RAMBlock *block;
936      uint64_t summary = 0;
937  
938      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
939          summary |= block->page_size;
940      }
941  
942      return summary;
943  }
944  
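/*
 * Worked example with illustrative numbers: a VM backed by 4 KiB pages plus
 * one 2 MiB hugepage block yields summary = 0x1000 | 0x200000 = 0x201000.
 */
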
945  uint64_t ram_get_total_transferred_pages(void)
946  {
947      return stat64_get(&mig_stats.normal_pages) +
948          stat64_get(&mig_stats.zero_pages) +
949          compress_ram_pages() + xbzrle_counters.pages;
950  }
951  
952  static void migration_update_rates(RAMState *rs, int64_t end_time)
953  {
954      uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
955  
956      /* calculate period counters */
957      stat64_set(&mig_stats.dirty_pages_rate,
958                 rs->num_dirty_pages_period * 1000 /
959                 (end_time - rs->time_last_bitmap_sync));
960  
961      if (!page_count) {
962          return;
963      }
964  
965      if (migrate_xbzrle()) {
966          double encoded_size, unencoded_size;
967  
968          xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
969              rs->xbzrle_cache_miss_prev) / page_count;
970          rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
971          unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
972                           TARGET_PAGE_SIZE;
973          encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
974          if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
975              xbzrle_counters.encoding_rate = 0;
976          } else {
977              xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
978          }
979          rs->xbzrle_pages_prev = xbzrle_counters.pages;
980          rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
981      }
982      compress_update_rates(page_count);
983  }
984  
985  /*
986   * Enable dirty-limit to throttle down the guest
987   */
988  static void migration_dirty_limit_guest(void)
989  {
990      /*
991       * dirty page rate quota for all vCPUs fetched from
992       * migration parameter 'vcpu_dirty_limit'
993       */
994      static int64_t quota_dirtyrate;
995      MigrationState *s = migrate_get_current();
996  
997      /*
998       * Nothing to do if the dirty limit is already enabled and the
999       * migration parameter vcpu-dirty-limit is untouched.
1000       */
1001      if (dirtylimit_in_service() &&
1002          quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
1003          return;
1004      }
1005  
1006      quota_dirtyrate = s->parameters.vcpu_dirty_limit;
1007  
1008      /*
1009       * Set a dirty-rate quota for all vCPUs; note that the second
1010       * parameter is ignored when setting the limit for all vCPUs of the VM.
1011       */
1012      qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
1013      trace_migration_dirty_limit_guest(quota_dirtyrate);
1014  }
1015  
1016  static void migration_trigger_throttle(RAMState *rs)
1017  {
1018      uint64_t threshold = migrate_throttle_trigger_threshold();
1019      uint64_t bytes_xfer_period =
1020          migration_transferred_bytes() - rs->bytes_xfer_prev;
1021      uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1022      uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1023  
1024      /* During block migration the auto-converge logic incorrectly detects
1025       * that ram migration makes no progress. Avoid this by disabling the
1026       * throttling logic during the bulk phase of block migration. */
1027      if (blk_mig_bulk_active()) {
1028          return;
1029      }
1030  
1031      /*
1032       * The following detection logic can be refined later. For now:
1033       * Check to see if the ratio between dirtied bytes and the approx.
1034       * amount of bytes that just got transferred since the last time
1035       * we were in this routine reaches the threshold. If that happens
1036       * twice, start or increase throttling.
1037       */
1038      if ((bytes_dirty_period > bytes_dirty_threshold) &&
1039          (++rs->dirty_rate_high_cnt >= 2)) {
1040          rs->dirty_rate_high_cnt = 0;
1041          if (migrate_auto_converge()) {
1042              trace_migration_throttle();
1043              mig_throttle_guest_down(bytes_dirty_period,
1044                                      bytes_dirty_threshold);
1045          } else if (migrate_dirty_limit()) {
1046              migration_dirty_limit_guest();
1047          }
1048      }
1049  }
1050  
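/*
 * Worked example with illustrative numbers: with, e.g., a trigger threshold
 * of 50 and bytes_xfer_period = 1 GiB, bytes_dirty_threshold is 512 MiB.
 * If the guest dirties more than 512 MiB in the same period twice in a row,
 * throttling (auto-converge or dirty-limit) kicks in or is increased.
 */
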
1051  static void migration_bitmap_sync(RAMState *rs, bool last_stage)
1052  {
1053      RAMBlock *block;
1054      int64_t end_time;
1055  
1056      stat64_add(&mig_stats.dirty_sync_count, 1);
1057  
1058      if (!rs->time_last_bitmap_sync) {
1059          rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1060      }
1061  
1062      trace_migration_bitmap_sync_start();
1063      memory_global_dirty_log_sync(last_stage);
1064  
1065      qemu_mutex_lock(&rs->bitmap_mutex);
1066      WITH_RCU_READ_LOCK_GUARD() {
1067          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1068              ramblock_sync_dirty_bitmap(rs, block);
1069          }
1070          stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1071      }
1072      qemu_mutex_unlock(&rs->bitmap_mutex);
1073  
1074      memory_global_after_dirty_log_sync();
1075      trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1076  
1077      end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1078  
1079      /* more than 1 second = 1000 milliseconds */
1080      if (end_time > rs->time_last_bitmap_sync + 1000) {
1081          migration_trigger_throttle(rs);
1082  
1083          migration_update_rates(rs, end_time);
1084  
1085          rs->target_page_count_prev = rs->target_page_count;
1086  
1087          /* reset period counters */
1088          rs->time_last_bitmap_sync = end_time;
1089          rs->num_dirty_pages_period = 0;
1090          rs->bytes_xfer_prev = migration_transferred_bytes();
1091      }
1092      if (migrate_events()) {
1093          uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1094          qapi_event_send_migration_pass(generation);
1095      }
1096  }
1097  
1098  static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1099  {
1100      Error *local_err = NULL;
1101  
1102      /*
1103       * The current notifier usage is just an optimization for migration, so we
1104       * don't stop the normal migration process in the error case.
1105       */
1106      if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1107          error_report_err(local_err);
1108          local_err = NULL;
1109      }
1110  
1111      migration_bitmap_sync(rs, last_stage);
1112  
1113      if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1114          error_report_err(local_err);
1115      }
1116  }
1117  
1118  void ram_release_page(const char *rbname, uint64_t offset)
1119  {
1120      if (!migrate_release_ram() || !migration_in_postcopy()) {
1121          return;
1122      }
1123  
1124      ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1125  }
1126  
1127  /**
1128   * save_zero_page: send the zero page to the stream
1129   *
1130   * Returns the number of pages written.
1131   *
1132   * @rs: current RAM state
1133   * @pss: current PSS channel
1134   * @offset: offset inside the block for the page
1135   */
1136  static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
1137                            ram_addr_t offset)
1138  {
1139      uint8_t *p = pss->block->host + offset;
1140      QEMUFile *file = pss->pss_channel;
1141      int len = 0;
1142  
1143      if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
1144          return 0;
1145      }
1146  
1147      if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1148          return 0;
1149      }
1150  
1151      stat64_add(&mig_stats.zero_pages, 1);
1152  
1153      if (migrate_mapped_ram()) {
1154          /* zero pages are not transferred with mapped-ram */
1155          clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
1156          return 1;
1157      }
1158  
1159      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
1160      qemu_put_byte(file, 0);
1161      len += 1;
1162      ram_release_page(pss->block->idstr, offset);
1163      ram_transferred_add(len);
1164  
1165      /*
1166       * Must let xbzrle know, otherwise a previous (now 0'd) cached
1167       * page would be stale.
1168       */
1169      if (rs->xbzrle_started) {
1170          XBZRLE_cache_lock();
1171          xbzrle_cache_zero_page(pss->block->offset + offset);
1172          XBZRLE_cache_unlock();
1173      }
1174  
1175      return len;
1176  }
1177  
1178  /*
1179   * @pages: the number of pages written by the control path,
1180   *        < 0 - error
1181   *        > 0 - number of pages written
1182   *
1183   * Return true if the page has been saved, otherwise return false.
1184   */
1185  static bool control_save_page(PageSearchStatus *pss,
1186                                ram_addr_t offset, int *pages)
1187  {
1188      int ret;
1189  
1190      ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
1191                                   TARGET_PAGE_SIZE);
1192      if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1193          return false;
1194      }
1195  
1196      if (ret == RAM_SAVE_CONTROL_DELAYED) {
1197          *pages = 1;
1198          return true;
1199      }
1200      *pages = ret;
1201      return true;
1202  }
1203  
1204  /*
1205   * directly send the page to the stream
1206   *
1207   * Returns the number of pages written.
1208   *
1209   * @pss: current PSS channel
1210   * @block: block that contains the page we want to send
1211   * @offset: offset inside the block for the page
1212   * @buf: the page to be sent
1213   * @async: send the page asynchronously
1214   */
1215  static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1216                              ram_addr_t offset, uint8_t *buf, bool async)
1217  {
1218      QEMUFile *file = pss->pss_channel;
1219  
1220      if (migrate_mapped_ram()) {
1221          qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
1222                             block->pages_offset + offset);
1223          set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
1224      } else {
1225          ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1226                                               offset | RAM_SAVE_FLAG_PAGE));
1227          if (async) {
1228              qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1229                                    migrate_release_ram() &&
1230                                    migration_in_postcopy());
1231          } else {
1232              qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1233          }
1234      }
1235      ram_transferred_add(TARGET_PAGE_SIZE);
1236      stat64_add(&mig_stats.normal_pages, 1);
1237      return 1;
1238  }
1239  
1240  /**
1241   * ram_save_page: send the given page to the stream
1242   *
1243   * Returns the number of pages written.
1244   *          < 0 - error
1245   *          >=0 - Number of pages written - this might legally be 0
1246   *                if xbzrle noticed the page was the same.
1247   *
1248   * @rs: current RAM state
1249   * @pss: data about the page we want to send
1251   */
1252  static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1253  {
1254      int pages = -1;
1255      uint8_t *p;
1256      bool send_async = true;
1257      RAMBlock *block = pss->block;
1258      ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1259      ram_addr_t current_addr = block->offset + offset;
1260  
1261      p = block->host + offset;
1262      trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1263  
1264      XBZRLE_cache_lock();
1265      if (rs->xbzrle_started && !migration_in_postcopy()) {
1266          pages = save_xbzrle_page(rs, pss, &p, current_addr,
1267                                   block, offset);
1268          if (!rs->last_stage) {
1269              /* Can't send this cached data async, since the cache page
1270               * might get updated before it gets to the wire
1271               */
1272              send_async = false;
1273          }
1274      }
1275  
1276      /* XBZRLE overflow or normal page */
1277      if (pages == -1) {
1278          pages = save_normal_page(pss, block, offset, p, send_async);
1279      }
1280  
1281      XBZRLE_cache_unlock();
1282  
1283      return pages;
1284  }
1285  
1286  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
1287  {
1288      if (!multifd_queue_page(block, offset)) {
1289          return -1;
1290      }
1291  
1292      return 1;
1293  }
1294  
1295  int compress_send_queued_data(CompressParam *param)
1296  {
1297      PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1298      MigrationState *ms = migrate_get_current();
1299      QEMUFile *file = ms->to_dst_file;
1300      int len = 0;
1301  
1302      RAMBlock *block = param->block;
1303      ram_addr_t offset = param->offset;
1304  
1305      if (param->result == RES_NONE) {
1306          return 0;
1307      }
1308  
1309      assert(block == pss->last_sent_block);
1310  
1311      if (param->result == RES_ZEROPAGE) {
1312          assert(qemu_file_buffer_empty(param->file));
1313          len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1314          qemu_put_byte(file, 0);
1315          len += 1;
1316          ram_release_page(block->idstr, offset);
1317      } else if (param->result == RES_COMPRESS) {
1318          assert(!qemu_file_buffer_empty(param->file));
1319          len += save_page_header(pss, file, block,
1320                                  offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1321          len += qemu_put_qemu_file(file, param->file);
1322      } else {
1323          abort();
1324      }
1325  
1326      update_compress_thread_counts(param, len);
1327  
1328      return len;
1329  }
1330  
1331  #define PAGE_ALL_CLEAN 0
1332  #define PAGE_TRY_AGAIN 1
1333  #define PAGE_DIRTY_FOUND 2
1334  /**
1335   * find_dirty_block: find the next dirty page and update any state
1336   * associated with the search process.
1337   *
1338   * Returns:
1339   *         <0: An error happened
1340   *         PAGE_ALL_CLEAN: no dirty page found, give up
1341   *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1342   *         PAGE_DIRTY_FOUND: dirty page found
1343   *
1344   * @rs: current RAM state
1345   * @pss: data about the state of the current dirty page scan
1347   */
1348  static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1349  {
1350      /* Update pss->page for the next dirty bit in ramblock */
1351      pss_find_next_dirty(pss);
1352  
1353      if (pss->complete_round && pss->block == rs->last_seen_block &&
1354          pss->page >= rs->last_page) {
1355          /*
1356           * We've been once around the RAM and haven't found anything.
1357           * Give up.
1358           */
1359          return PAGE_ALL_CLEAN;
1360      }
1361      if (!offset_in_ramblock(pss->block,
1362                              ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1363          /* Didn't find anything in this RAM Block */
1364          pss->page = 0;
1365          pss->block = QLIST_NEXT_RCU(pss->block, next);
1366          if (!pss->block) {
1367              if (migrate_multifd() &&
1368                  (!migrate_multifd_flush_after_each_section() ||
1369                   migrate_mapped_ram())) {
1370                  QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1371                  int ret = multifd_send_sync_main();
1372                  if (ret < 0) {
1373                      return ret;
1374                  }
1375  
1376                  if (!migrate_mapped_ram()) {
1377                      qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1378                      qemu_fflush(f);
1379                  }
1380              }
1381              /*
1382               * If memory migration starts over, we will meet a dirtied page
1383               * which may still exist in the compression threads' ring, so we
1384               * should flush the compressed data to make sure the new page
1385               * is not overwritten by the old one in the destination.
1386               *
1387               * Also, if xbzrle is on, stop using data compression at this
1388               * point. In theory, xbzrle can do better than compression.
1389               */
1390              compress_flush_data();
1391  
1392              /* Hit the end of the list */
1393              pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1394              /* Flag that we've looped */
1395              pss->complete_round = true;
1396              /* After the first round, enable XBZRLE. */
1397              if (migrate_xbzrle()) {
1398                  rs->xbzrle_started = true;
1399              }
1400          }
1401          /* Didn't find anything this time, but try again on the new block */
1402          return PAGE_TRY_AGAIN;
1403      } else {
1404          /* We've found something */
1405          return PAGE_DIRTY_FOUND;
1406      }
1407  }
1408  
1409  /**
1410   * unqueue_page: gets a page off the queue
1411   *
1412   * Helper for 'get_queued_page' - gets a page off the queue
1413   *
1414   * Returns the block of the page (or NULL if none available)
1415   *
1416   * @rs: current RAM state
1417   * @offset: used to return the offset within the RAMBlock
1418   */
1419  static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1420  {
1421      struct RAMSrcPageRequest *entry;
1422      RAMBlock *block = NULL;
1423  
1424      if (!postcopy_has_request(rs)) {
1425          return NULL;
1426      }
1427  
1428      QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1429  
1430      /*
1431       * This should _never_ change even after we take the lock, because no one
1432       * should be taking anything off the request list other than us.
1433       */
1434      assert(postcopy_has_request(rs));
1435  
1436      entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1437      block = entry->rb;
1438      *offset = entry->offset;
1439  
1440      if (entry->len > TARGET_PAGE_SIZE) {
1441          entry->len -= TARGET_PAGE_SIZE;
1442          entry->offset += TARGET_PAGE_SIZE;
1443      } else {
1444          memory_region_unref(block->mr);
1445          QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1446          g_free(entry);
1447          migration_consume_urgent_request();
1448      }
1449  
1450      return block;
1451  }
1452  
1453  #if defined(__linux__)
1454  /**
1455   * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1456   *   is found, return RAM block pointer and page offset
1457   *
1458   * Returns pointer to the RAMBlock containing faulting page,
1459   *   NULL if no write faults are pending
1460   *
1461   * @rs: current RAM state
1462   * @offset: page offset from the beginning of the block
1463   */
1464  static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1465  {
1466      struct uffd_msg uffd_msg;
1467      void *page_address;
1468      RAMBlock *block;
1469      int res;
1470  
1471      if (!migrate_background_snapshot()) {
1472          return NULL;
1473      }
1474  
1475      res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1476      if (res <= 0) {
1477          return NULL;
1478      }
1479  
1480      page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1481      block = qemu_ram_block_from_host(page_address, false, offset);
1482      assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1483      return block;
1484  }
1485  
1486  /**
1487   * ram_save_release_protection: release UFFD write protection after
1488   *   a range of pages has been saved
1489   *
1490   * @rs: current RAM state
1491   * @pss: page-search-status structure
1492   * @start_page: index of the first page in the range relative to pss->block
1493   *
1494   * Returns 0 on success, negative value in case of an error
1495  */
1496  static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1497          unsigned long start_page)
1498  {
1499      int res = 0;
1500  
1501      /* Check if page is from UFFD-managed region. */
1502      if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1503          void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1504          uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1505  
1506          /* Flush async buffers before un-protect. */
1507          qemu_fflush(pss->pss_channel);
1508          /* Un-protect memory range. */
1509          res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1510                  false, false);
1511      }
1512  
1513      return res;
1514  }
1515  
1516  /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1517   *
1518   * Returns true if supported, false otherwise
1519   */
1520  bool ram_write_tracking_available(void)
1521  {
1522      uint64_t uffd_features;
1523      int res;
1524  
1525      res = uffd_query_features(&uffd_features);
1526      return (res == 0 &&
1527              (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1528  }
1529  
1530  /* ram_write_tracking_compatible: check if guest configuration is
1531   *   compatible with 'write-tracking'
1532   *
1533   * Returns true if compatible, false otherwise
1534   */
1535  bool ram_write_tracking_compatible(void)
1536  {
1537      const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1538      int uffd_fd;
1539      RAMBlock *block;
1540      bool ret = false;
1541  
1542      /* Open UFFD file descriptor */
1543      uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1544      if (uffd_fd < 0) {
1545          return false;
1546      }
1547  
1548      RCU_READ_LOCK_GUARD();
1549  
1550      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1551          uint64_t uffd_ioctls;
1552  
1553          /* Nothing to do for read-only and MMIO-writable regions */
1554          if (block->mr->readonly || block->mr->rom_device) {
1555              continue;
1556          }
1557          /* Try to register block memory via UFFD-IO to track writes */
1558          if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1559                  UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1560              goto out;
1561          }
1562          if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1563              goto out;
1564          }
1565      }
1566      ret = true;
1567  
1568  out:
1569      uffd_close_fd(uffd_fd);
1570      return ret;
1571  }
1572  
1573  static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1574                                         ram_addr_t size)
1575  {
1576      const ram_addr_t end = offset + size;
1577  
1578      /*
1579       * We read one byte of each page; this will preallocate page tables if
1580       * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1581       * where no page was populated yet. This might require adaptation when
1582       * supporting other mappings, like shmem.
1583       */
1584      for (; offset < end; offset += block->page_size) {
1585          char tmp = *((char *)block->host + offset);
1586  
1587          /* Don't optimize the read out */
1588          asm volatile("" : "+r" (tmp));
1589      }
1590  }
1591  
1592  static inline int populate_read_section(MemoryRegionSection *section,
1593                                          void *opaque)
1594  {
1595      const hwaddr size = int128_get64(section->size);
1596      hwaddr offset = section->offset_within_region;
1597      RAMBlock *block = section->mr->ram_block;
1598  
1599      populate_read_range(block, offset, size);
1600      return 0;
1601  }
1602  
1603  /*
1604   * ram_block_populate_read: preallocate page tables and populate pages in the
1605   *   RAM block by reading a byte of each page.
1606   *
1607   * Since it's solely used for the userfault_fd WP feature, here we just
1608   *   hardcode the page size to qemu_real_host_page_size.
1609   *
1610   * @rb: RAM block to populate
1611   */
1612  static void ram_block_populate_read(RAMBlock *rb)
1613  {
1614      /*
1615       * Skip populating all pages that fall into a discarded range as managed by
1616       * a RamDiscardManager responsible for the mapped memory region of the
1617       * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1618       * must not get populated automatically. We don't have to track
1619       * modifications via userfaultfd WP reliably, because these pages will
1620       * not be part of the migration stream either way -- see
1621       * ramblock_dirty_bitmap_exclude_discarded_pages().
1622       *
1623       * Note: The result is only stable while migrating (precopy/postcopy).
1624       */
1625      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1626          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1627          MemoryRegionSection section = {
1628              .mr = rb->mr,
1629              .offset_within_region = 0,
1630              .size = rb->mr->size,
1631          };
1632  
1633          ram_discard_manager_replay_populated(rdm, &section,
1634                                               populate_read_section, NULL);
1635      } else {
1636          populate_read_range(rb, 0, rb->used_length);
1637      }
1638  }
1639  
1640  /*
1641   * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1642   */
1643  void ram_write_tracking_prepare(void)
1644  {
1645      RAMBlock *block;
1646  
1647      RCU_READ_LOCK_GUARD();
1648  
1649      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1650          /* Nothing to do for read-only and MMIO-writable regions */
1651          if (block->mr->readonly || block->mr->rom_device) {
1652              continue;
1653          }
1654  
1655          /*
1656           * Populate pages of the RAM block before enabling userfault_fd
1657           * write protection.
1658           *
1659           * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1660           * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1661           * pages with pte_none() entries in page table.
1662           */
1663          ram_block_populate_read(block);
1664      }
1665  }
1666  
1667  static inline int uffd_protect_section(MemoryRegionSection *section,
1668                                         void *opaque)
1669  {
1670      const hwaddr size = int128_get64(section->size);
1671      const hwaddr offset = section->offset_within_region;
1672      RAMBlock *rb = section->mr->ram_block;
1673      int uffd_fd = (uintptr_t)opaque;
1674  
1675      return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1676                                    false);
1677  }
1678  
1679  static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1680  {
1681      assert(rb->flags & RAM_UF_WRITEPROTECT);
1682  
1683      /* See ram_block_populate_read() */
1684      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1685          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1686          MemoryRegionSection section = {
1687              .mr = rb->mr,
1688              .offset_within_region = 0,
1689              .size = rb->mr->size,
1690          };
1691  
1692          return ram_discard_manager_replay_populated(rdm, &section,
1693                                                      uffd_protect_section,
1694                                                      (void *)(uintptr_t)uffd_fd);
1695      }
1696      return uffd_change_protection(uffd_fd, rb->host,
1697                                    rb->used_length, true, false);
1698  }
1699  
1700  /*
1701   * ram_write_tracking_start: start UFFD-WP memory tracking
1702   *
1703   * Returns 0 for success or negative value in case of error
1704   */
1705  int ram_write_tracking_start(void)
1706  {
1707      int uffd_fd;
1708      RAMState *rs = ram_state;
1709      RAMBlock *block;
1710  
1711      /* Open UFFD file descriptor */
1712      uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1713      if (uffd_fd < 0) {
1714          return uffd_fd;
1715      }
1716      rs->uffdio_fd = uffd_fd;
1717  
1718      RCU_READ_LOCK_GUARD();
1719  
1720      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1721          /* Nothing to do for read-only and MMIO-writable regions */
1722          if (block->mr->readonly || block->mr->rom_device) {
1723              continue;
1724          }
1725  
1726          /* Register block memory with UFFD to track writes */
1727          if (uffd_register_memory(rs->uffdio_fd, block->host,
1728                  block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1729              goto fail;
1730          }
1731          block->flags |= RAM_UF_WRITEPROTECT;
1732          memory_region_ref(block->mr);
1733  
1734          /* Apply UFFD write protection to the block memory range */
1735          if (ram_block_uffd_protect(block, uffd_fd)) {
1736              goto fail;
1737          }
1738  
1739          trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1740                  block->host, block->max_length);
1741      }
1742  
1743      return 0;
1744  
1745  fail:
1746      error_report("ram_write_tracking_start() failed: restoring initial memory state");
1747  
1748      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1749          if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1750              continue;
1751          }
1752          uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1753          /* Cleanup flags and remove reference */
1754          block->flags &= ~RAM_UF_WRITEPROTECT;
1755          memory_region_unref(block->mr);
1756      }
1757  
1758      uffd_close_fd(uffd_fd);
1759      rs->uffdio_fd = -1;
1760      return -1;
1761  }
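
/*
 * Illustrative sketch (not used by this file): the register-then-protect
 * sequence above corresponds to two kernel ioctls;
 * uffd_register_memory()/uffd_change_protection() are assumed to be thin
 * wrappers around them.  The example_* name is made up.
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *     #include <sys/ioctl.h>
 *     #include <linux/userfaultfd.h>
 *
 *     static int example_write_protect_range(int uffd, void *host, uint64_t len)
 *     {
 *         struct uffdio_register reg;
 *         struct uffdio_writeprotect wp;
 *
 *         memset(&reg, 0, sizeof(reg));
 *         reg.range.start = (uint64_t)(uintptr_t)host;
 *         reg.range.len = len;
 *         reg.mode = UFFDIO_REGISTER_MODE_WP;
 *         if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
 *             return -1;
 *         }
 *
 *         memset(&wp, 0, sizeof(wp));
 *         wp.range = reg.range;
 *         wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;  // mode = 0 would un-protect
 *         return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *     }
 */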
1762  
1763  /**
1764   * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1765   */
1766  void ram_write_tracking_stop(void)
1767  {
1768      RAMState *rs = ram_state;
1769      RAMBlock *block;
1770  
1771      RCU_READ_LOCK_GUARD();
1772  
1773      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1774          if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1775              continue;
1776          }
1777          uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1778  
1779          trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1780                  block->host, block->max_length);
1781  
1782          /* Cleanup flags and remove reference */
1783          block->flags &= ~RAM_UF_WRITEPROTECT;
1784          memory_region_unref(block->mr);
1785      }
1786  
1787      /* Finally close UFFD file descriptor */
1788      uffd_close_fd(rs->uffdio_fd);
1789      rs->uffdio_fd = -1;
1790  }
1791  
1792  #else
1793  /* No target OS support, stubs just fail or ignore */
1794  
1795  static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1796  {
1797      (void) rs;
1798      (void) offset;
1799  
1800      return NULL;
1801  }
1802  
1803  static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1804          unsigned long start_page)
1805  {
1806      (void) rs;
1807      (void) pss;
1808      (void) start_page;
1809  
1810      return 0;
1811  }
1812  
1813  bool ram_write_tracking_available(void)
1814  {
1815      return false;
1816  }
1817  
1818  bool ram_write_tracking_compatible(void)
1819  {
1820      assert(0);
1821      return false;
1822  }
1823  
1824  int ram_write_tracking_start(void)
1825  {
1826      assert(0);
1827      return -1;
1828  }
1829  
1830  void ram_write_tracking_stop(void)
1831  {
1832      assert(0);
1833  }
1834  #endif /* defined(__linux__) */
1835  
1836  /**
1837   * get_queued_page: unqueue a page from the postcopy requests
1838   *
1839   * Skips pages that are already sent (!dirty)
1840   *
1841   * Returns true if a queued page is found
1842   *
1843   * @rs: current RAM state
1844   * @pss: data about the state of the current dirty page scan
1845   */
1846  static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1847  {
1848      RAMBlock  *block;
1849      ram_addr_t offset;
1850      bool dirty;
1851  
1852      do {
1853          block = unqueue_page(rs, &offset);
1854          /*
1855           * We're sending this page, and since it's postcopy nothing else
1856           * will dirty it, so we must make sure it doesn't get sent again
1857           * even if this queue request was received after the background
1858           * search already sent it.
1859           */
1860          if (block) {
1861              unsigned long page;
1862  
1863              page = offset >> TARGET_PAGE_BITS;
1864              dirty = test_bit(page, block->bmap);
1865              if (!dirty) {
1866                  trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1867                                                  page);
1868              } else {
1869                  trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1870              }
1871          }
1872  
1873      } while (block && !dirty);
1874  
1875      if (!block) {
1876          /*
1877           * Poll write faults too if background snapshot is enabled; that's
1878           * when vcpus can get blocked by the write-protected pages.
1879           */
1880          block = poll_fault_page(rs, &offset);
1881      }
1882  
1883      if (block) {
1884          /*
1885           * We want the background search to continue from the queued page
1886           * since the guest is likely to want other pages near to the page
1887           * it just requested.
1888           */
1889          pss->block = block;
1890          pss->page = offset >> TARGET_PAGE_BITS;
1891  
1892          /*
1893           * This unqueued page would break the "one round" check, even if
1894           * it's really rare.
1895           */
1896          pss->complete_round = false;
1897      }
1898  
1899      return !!block;
1900  }
1901  
1902  /**
1903   * migration_page_queue_free: drop any remaining pages in the ram
1904   * request queue
1905   *
1906   * It should be empty at the end anyway, but in error cases there may
1907   * be some left.  in case that there is any page left, we drop it.
1908   *
1909   */
1910  static void migration_page_queue_free(RAMState *rs)
1911  {
1912      struct RAMSrcPageRequest *mspr, *next_mspr;
1913      /* This queue should generally be empty - but in the case of a failed
1914       * migration it might have some leftovers in it.
1915       */
1916      RCU_READ_LOCK_GUARD();
1917      QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1918          memory_region_unref(mspr->rb->mr);
1919          QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1920          g_free(mspr);
1921      }
1922  }
1923  
1924  /**
1925   * ram_save_queue_pages: queue the page for transmission
1926   *
1927   * A request from postcopy destination for example.
1928   *
1929   * Returns zero on success or negative on error
1930   *
1931   * @rbname: Name of the RAMBlock of the request. NULL means the
1932   *          same as the last one.
1933   * @start: starting address from the start of the RAMBlock
1934   * @len: length (in bytes) to send
1935   */
1936  int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
1937                           Error **errp)
1938  {
1939      RAMBlock *ramblock;
1940      RAMState *rs = ram_state;
1941  
1942      stat64_add(&mig_stats.postcopy_requests, 1);
1943      RCU_READ_LOCK_GUARD();
1944  
1945      if (!rbname) {
1946          /* Reuse last RAMBlock */
1947          ramblock = rs->last_req_rb;
1948  
1949          if (!ramblock) {
1950              /*
1951               * Shouldn't happen, we can't reuse the last RAMBlock if
1952               * it's the 1st request.
1953               */
1954              error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block");
1955              return -1;
1956          }
1957      } else {
1958          ramblock = qemu_ram_block_by_name(rbname);
1959  
1960          if (!ramblock) {
1961              /* We shouldn't be asked for a non-existent RAMBlock */
1962              error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname);
1963              return -1;
1964          }
1965          rs->last_req_rb = ramblock;
1966      }
1967      trace_ram_save_queue_pages(ramblock->idstr, start, len);
1968      if (!offset_in_ramblock(ramblock, start + len - 1)) {
1969          error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, "
1970                     "start=" RAM_ADDR_FMT " len="
1971                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1972                     start, len, ramblock->used_length);
1973          return -1;
1974      }
1975  
1976      /*
1977       * When postcopy preempt is enabled, we send back the page directly in the
1978       * rp-return thread.
1979       */
1980      if (postcopy_preempt_active()) {
1981          ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1982          size_t page_size = qemu_ram_pagesize(ramblock);
1983          PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1984          int ret = 0;
1985  
1986          qemu_mutex_lock(&rs->bitmap_mutex);
1987  
1988          pss_init(pss, ramblock, page_start);
1989          /*
1990           * Always use the preempt channel, and make sure it's there.  It's
1991           * safe to access without a lock, because when the rp-thread is running
1992           * we should be the only one operating on the qemufile.
1993           */
1994          pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
1995          assert(pss->pss_channel);
1996  
1997          /*
1998           * It must be one or a multiple of the host page size.  Just
1999           * assert; if something is wrong we're mostly split-brain anyway.
2000           */
2001          assert(len % page_size == 0);
2002          while (len) {
2003              if (ram_save_host_page_urgent(pss)) {
2004                  error_setg(errp, "ram_save_host_page_urgent() failed: "
2005                             "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2006                             ramblock->idstr, start);
2007                  ret = -1;
2008                  break;
2009              }
2010              /*
2011               * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2012               * will automatically be moved and point to the next host page
2013               * we're going to send, so no need to update here.
2014               *
2015               * Normally QEMU never sends >1 host page in requests, so
2016               * logically we don't even need the loop as it should only
2017               * run once, but keep it for consistency.
2018               */
2019              len -= page_size;
2020          }
2021          qemu_mutex_unlock(&rs->bitmap_mutex);
2022  
2023          return ret;
2024      }
2025  
2026      struct RAMSrcPageRequest *new_entry =
2027          g_new0(struct RAMSrcPageRequest, 1);
2028      new_entry->rb = ramblock;
2029      new_entry->offset = start;
2030      new_entry->len = len;
2031  
2032      memory_region_ref(ramblock->mr);
2033      qemu_mutex_lock(&rs->src_page_req_mutex);
2034      QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2035      migration_make_urgent_request();
2036      qemu_mutex_unlock(&rs->src_page_req_mutex);
2037  
2038      return 0;
2039  }
2040  
2041  /*
2042   * try to compress the page before posting it out, return true if the page
2043   * has been properly handled by compression, otherwise needs other
2044   * paths to handle it
2045   */
2046  static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2047                                 ram_addr_t offset)
2048  {
2049      if (!migrate_compress()) {
2050          return false;
2051      }
2052  
2053      /*
2054       * When starting the process of a new block, the first page of
2055       * the block should be sent out before other pages in the same
2056       * block, and all the pages in the last block should have been sent
2057       * out; keeping this order is important, because the 'cont' flag
2058       * is used to avoid resending the block name.
2059       *
2060       * We post the first page as a normal page as compression will take
2061       * a lot of CPU resource.
2062       */
2063      if (pss->block != pss->last_sent_block) {
2064          compress_flush_data();
2065          return false;
2066      }
2067  
2068      return compress_page_with_multi_thread(pss->block, offset,
2069                                             compress_send_queued_data);
2070  }
2071  
2072  /**
2073   * ram_save_target_page_legacy: save one target page
2074   *
2075   * Returns the number of pages written
2076   *
2077   * @rs: current RAM state
2078   * @pss: data about the page we want to send
2079   */
2080  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2081  {
2082      ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2083      int res;
2084  
2085      if (control_save_page(pss, offset, &res)) {
2086          return res;
2087      }
2088  
2089      if (save_compress_page(rs, pss, offset)) {
2090          return 1;
2091      }
2092  
2093      if (save_zero_page(rs, pss, offset)) {
2094          return 1;
2095      }
2096  
2097      return ram_save_page(rs, pss);
2098  }
2099  
2100  /**
2101   * ram_save_target_page_multifd: send one target page to multifd workers
2102   *
2103   * Returns 1 if the page was queued, -1 otherwise.
2104   *
2105   * @rs: current RAM state
2106   * @pss: data about the page we want to send
2107   */
2108  static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss)
2109  {
2110      RAMBlock *block = pss->block;
2111      ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2112  
2113      /*
2114       * While using multifd live migration, we still need to handle zero
2115       * page checking on the migration main thread.
2116       */
2117      if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
2118          if (save_zero_page(rs, pss, offset)) {
2119              return 1;
2120          }
2121      }
2122  
2123      return ram_save_multifd_page(block, offset);
2124  }
2125  
2126  /* Should be called before sending a host page */
2127  static void pss_host_page_prepare(PageSearchStatus *pss)
2128  {
2129      /* How many guest pages are there in one host page? */
2130      size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2131  
2132      pss->host_page_sending = true;
2133      if (guest_pfns <= 1) {
2134          /*
2135           * This covers both the case of guest psize == host psize and the
2136           * case where the guest has a larger psize than the host (guest_pfns==0).
2137           *
2138           * For the latter, we always send one whole guest page per
2139           * iteration of the host page (example: an Alpha VM on x86 host
2140           * will have guest psize 8K while host psize 4K).
2141           */
2142          pss->host_page_start = pss->page;
2143          pss->host_page_end = pss->page + 1;
2144      } else {
2145          /*
2146           * The host page spans over multiple guest pages, we send them
2147           * within the same host page iteration.
2148           */
2149          pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2150          pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2151      }
2152  }
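
/*
 * Worked example (illustrative, assuming a 2 MiB hugepage-backed block and
 * 4 KiB target pages): guest_pfns = 2 MiB / 4 KiB = 512.  For pss->page ==
 * 1000, the code above yields host_page_start = ROUND_DOWN(1000, 512) = 512
 * and host_page_end = ROUND_UP(1001, 512) = 1024, i.e. the whole host page
 * covering target pages [512, 1024) is sent within this host-page iteration.
 */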
2153  
2154  /*
2155   * Whether the page pointed by PSS is within the host page being sent.
2156   * Must be called after a previous pss_host_page_prepare().
2157   */
2158  static bool pss_within_range(PageSearchStatus *pss)
2159  {
2160      ram_addr_t ram_addr;
2161  
2162      assert(pss->host_page_sending);
2163  
2164      /* Over host-page boundary? */
2165      if (pss->page >= pss->host_page_end) {
2166          return false;
2167      }
2168  
2169      ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2170  
2171      return offset_in_ramblock(pss->block, ram_addr);
2172  }
2173  
2174  static void pss_host_page_finish(PageSearchStatus *pss)
2175  {
2176      pss->host_page_sending = false;
2177      /* This is not needed, but just to reset it */
2178      pss->host_page_start = pss->host_page_end = 0;
2179  }
2180  
2181  /*
2182   * Send an urgent host page specified by `pss'.  Need to be called with
2183   * bitmap_mutex held.
2184   *
2185   * Returns 0 if saving the host page succeeded, negative value otherwise.
2186   */
2187  static int ram_save_host_page_urgent(PageSearchStatus *pss)
2188  {
2189      bool page_dirty, sent = false;
2190      RAMState *rs = ram_state;
2191      int ret = 0;
2192  
2193      trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2194      pss_host_page_prepare(pss);
2195  
2196      /*
2197       * If precopy is sending the same page, let it be done in precopy, or
2198       * we could send the same page on two channels and neither of them would
2199       * receive the whole page.
2200       */
2201      if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2202          trace_postcopy_preempt_hit(pss->block->idstr,
2203                                     pss->page << TARGET_PAGE_BITS);
2204          return 0;
2205      }
2206  
2207      do {
2208          page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2209  
2210          if (page_dirty) {
2211              /* Be strict about the return code; it must be 1 here */
2212              if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2213                  error_report_once("%s: ram_save_target_page failed", __func__);
2214                  ret = -1;
2215                  goto out;
2216              }
2217              sent = true;
2218          }
2219          pss_find_next_dirty(pss);
2220      } while (pss_within_range(pss));
2221  out:
2222      pss_host_page_finish(pss);
2223      /* For urgent requests, flush immediately if sent */
2224      if (sent) {
2225          qemu_fflush(pss->pss_channel);
2226      }
2227      return ret;
2228  }
2229  
2230  /**
2231   * ram_save_host_page: save a whole host page
2232   *
2233   * Starting at *offset send pages up to the end of the current host
2234   * page. It's valid for the initial offset to point into the middle of
2235   * a host page in which case the remainder of the hostpage is sent.
2236   * Only dirty target pages are sent. Note that the host page size may
2237   * be a huge page for this block.
2238   *
2239   * The saving stops at the boundary of the used_length of the block
2240   * if the RAMBlock isn't a multiple of the host page size.
2241   *
2242   * The caller must hold ram_state.bitmap_mutex when calling this
2243   * function.  Note that this function can temporarily release the lock, but
2244   * it makes sure the lock is held again before it returns.
2245   *
2246   * Returns the number of pages written or negative on error
2247   *
2248   * @rs: current RAM state
2249   * @pss: data about the page we want to send
2250   */
2251  static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2252  {
2253      bool page_dirty, preempt_active = postcopy_preempt_active();
2254      int tmppages, pages = 0;
2255      size_t pagesize_bits =
2256          qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2257      unsigned long start_page = pss->page;
2258      int res;
2259  
2260      if (migrate_ram_is_ignored(pss->block)) {
2261          error_report("block %s should not be migrated!", pss->block->idstr);
2262          return 0;
2263      }
2264  
2265      /* Update host page boundary information */
2266      pss_host_page_prepare(pss);
2267  
2268      do {
2269          page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2270  
2271          /* Check whether the page is dirty and, if it is, send it */
2272          if (page_dirty) {
2273              /*
2274               * Properly yield the lock only in postcopy preempt mode
2275               * because both migration thread and rp-return thread can
2276               * operate on the bitmaps.
2277               */
2278              if (preempt_active) {
2279                  qemu_mutex_unlock(&rs->bitmap_mutex);
2280              }
2281              tmppages = migration_ops->ram_save_target_page(rs, pss);
2282              if (tmppages >= 0) {
2283                  pages += tmppages;
2284                  /*
2285                   * Allow rate limiting to happen in the middle of huge pages if
2286                   * something is sent in the current iteration.
2287                   */
2288                  if (pagesize_bits > 1 && tmppages > 0) {
2289                      migration_rate_limit();
2290                  }
2291              }
2292              if (preempt_active) {
2293                  qemu_mutex_lock(&rs->bitmap_mutex);
2294              }
2295          } else {
2296              tmppages = 0;
2297          }
2298  
2299          if (tmppages < 0) {
2300              pss_host_page_finish(pss);
2301              return tmppages;
2302          }
2303  
2304          pss_find_next_dirty(pss);
2305      } while (pss_within_range(pss));
2306  
2307      pss_host_page_finish(pss);
2308  
2309      res = ram_save_release_protection(rs, pss, start_page);
2310      return (res < 0 ? res : pages);
2311  }
2312  
2313  /**
2314   * ram_find_and_save_block: finds a dirty page and sends it to f
2315   *
2316   * Called within an RCU critical section.
2317   *
2318   * Returns the number of pages written where zero means no dirty pages,
2319   * or negative on error
2320   *
2321   * @rs: current RAM state
2322   *
2323   * On systems where host-page-size > target-page-size it will send all the
2324   * pages in a host page that are dirty.
2325   */
2326  static int ram_find_and_save_block(RAMState *rs)
2327  {
2328      PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2329      int pages = 0;
2330  
2331      /* No dirty page as there is zero RAM */
2332      if (!rs->ram_bytes_total) {
2333          return pages;
2334      }
2335  
2336      /*
2337       * Always keep last_seen_block/last_page valid during this procedure,
2338       * because find_dirty_block() relies on these values (e.g., we compare
2339       * last_seen_block with pss.block to see whether we searched all the
2340   * ramblocks) to detect the completion of migration.  Having a NULL value
2341   * of last_seen_block can conditionally cause the loop below to run forever.
2342       */
2343      if (!rs->last_seen_block) {
2344          rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2345          rs->last_page = 0;
2346      }
2347  
2348      pss_init(pss, rs->last_seen_block, rs->last_page);
2349  
2350      while (true) {
2351          if (!get_queued_page(rs, pss)) {
2352              /* priority queue empty, so just search for something dirty */
2353              int res = find_dirty_block(rs, pss);
2354              if (res != PAGE_DIRTY_FOUND) {
2355                  if (res == PAGE_ALL_CLEAN) {
2356                      break;
2357                  } else if (res == PAGE_TRY_AGAIN) {
2358                      continue;
2359                  } else if (res < 0) {
2360                      pages = res;
2361                      break;
2362                  }
2363              }
2364          }
2365          pages = ram_save_host_page(rs, pss);
2366          if (pages) {
2367              break;
2368          }
2369      }
2370  
2371      rs->last_seen_block = pss->block;
2372      rs->last_page = pss->page;
2373  
2374      return pages;
2375  }
2376  
2377  static uint64_t ram_bytes_total_with_ignored(void)
2378  {
2379      RAMBlock *block;
2380      uint64_t total = 0;
2381  
2382      RCU_READ_LOCK_GUARD();
2383  
2384      RAMBLOCK_FOREACH_MIGRATABLE(block) {
2385          total += block->used_length;
2386      }
2387      return total;
2388  }
2389  
2390  uint64_t ram_bytes_total(void)
2391  {
2392      RAMBlock *block;
2393      uint64_t total = 0;
2394  
2395      RCU_READ_LOCK_GUARD();
2396  
2397      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2398          total += block->used_length;
2399      }
2400      return total;
2401  }
2402  
2403  static void xbzrle_load_setup(void)
2404  {
2405      XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2406  }
2407  
2408  static void xbzrle_load_cleanup(void)
2409  {
2410      g_free(XBZRLE.decoded_buf);
2411      XBZRLE.decoded_buf = NULL;
2412  }
2413  
2414  static void ram_state_cleanup(RAMState **rsp)
2415  {
2416      if (*rsp) {
2417          migration_page_queue_free(*rsp);
2418          qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2419          qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2420          g_free(*rsp);
2421          *rsp = NULL;
2422      }
2423  }
2424  
2425  static void xbzrle_cleanup(void)
2426  {
2427      XBZRLE_cache_lock();
2428      if (XBZRLE.cache) {
2429          cache_fini(XBZRLE.cache);
2430          g_free(XBZRLE.encoded_buf);
2431          g_free(XBZRLE.current_buf);
2432          g_free(XBZRLE.zero_target_page);
2433          XBZRLE.cache = NULL;
2434          XBZRLE.encoded_buf = NULL;
2435          XBZRLE.current_buf = NULL;
2436          XBZRLE.zero_target_page = NULL;
2437      }
2438      XBZRLE_cache_unlock();
2439  }
2440  
2441  static void ram_save_cleanup(void *opaque)
2442  {
2443      RAMState **rsp = opaque;
2444      RAMBlock *block;
2445  
2446      /* We don't use dirty log with background snapshots */
2447      if (!migrate_background_snapshot()) {
2448          /* The caller holds the BQL or is in a BH, so there is
2449           * no write race against the migration bitmap
2450           */
2451          if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2452              /*
2453               * do not stop the dirty log without having started it, since
2454               * memory_global_dirty_log_stop will assert that
2455               * memory_global_dirty_log_start/stop are used in pairs
2456               */
2457              memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2458          }
2459      }
2460  
2461      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2462          g_free(block->clear_bmap);
2463          block->clear_bmap = NULL;
2464          g_free(block->bmap);
2465          block->bmap = NULL;
2466      }
2467  
2468      xbzrle_cleanup();
2469      compress_threads_save_cleanup();
2470      ram_state_cleanup(rsp);
2471      g_free(migration_ops);
2472      migration_ops = NULL;
2473  }
2474  
2475  static void ram_state_reset(RAMState *rs)
2476  {
2477      int i;
2478  
2479      for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2480          rs->pss[i].last_sent_block = NULL;
2481      }
2482  
2483      rs->last_seen_block = NULL;
2484      rs->last_page = 0;
2485      rs->last_version = ram_list.version;
2486      rs->xbzrle_started = false;
2487  }
2488  
2489  #define MAX_WAIT 50 /* ms, half buffered_file limit */
2490  
2491  /* **** functions for postcopy ***** */
2492  
2493  void ram_postcopy_migrated_memory_release(MigrationState *ms)
2494  {
2495      struct RAMBlock *block;
2496  
2497      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2498          unsigned long *bitmap = block->bmap;
2499          unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2500          unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2501  
2502          while (run_start < range) {
2503              unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2504              ram_discard_range(block->idstr,
2505                                ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2506                                ((ram_addr_t)(run_end - run_start))
2507                                  << TARGET_PAGE_BITS);
2508              run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2509          }
2510      }
2511  }
2512  
2513  /**
2514   * postcopy_send_discard_bm_ram: discard a RAMBlock
2515   *
2516   * Callback from postcopy_each_ram_send_discard for each RAMBlock
2517   *
2518   * @ms: current migration state
2519   * @block: RAMBlock to discard
2520   */
2521  static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2522  {
2523      unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2524      unsigned long current;
2525      unsigned long *bitmap = block->bmap;
2526  
2527      for (current = 0; current < end; ) {
2528          unsigned long one = find_next_bit(bitmap, end, current);
2529          unsigned long zero, discard_length;
2530  
2531          if (one >= end) {
2532              break;
2533          }
2534  
2535          zero = find_next_zero_bit(bitmap, end, one + 1);
2536  
2537          if (zero >= end) {
2538              discard_length = end - one;
2539          } else {
2540              discard_length = zero - one;
2541          }
2542          postcopy_discard_send_range(ms, one, discard_length);
2543          current = one + discard_length;
2544      }
2545  }
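
/*
 * Worked example (illustrative): for a block where only target pages 2..5
 * are dirty and end == 8, the loop above finds one = 2 and zero = 6, so
 * discard_length = 4 and a single postcopy_discard_send_range(ms, 2, 4) is
 * emitted; current then becomes 6 and the next find_next_bit() returns
 * 8 >= end, terminating the loop.
 */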
2546  
2547  static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2548  
2549  /**
2550   * postcopy_each_ram_send_discard: discard all RAMBlocks
2551   *
2552   * Utility for the outgoing postcopy code.
2553   *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2554   *   passing it bitmap indexes and name.
2555   * (qemu_ram_foreach_block ends up passing unscaled lengths
2556   *  which would mean the postcopy code would have to deal with target pages)
2557   *
2558   * @ms: current migration state
2559   */
2560  static void postcopy_each_ram_send_discard(MigrationState *ms)
2561  {
2562      struct RAMBlock *block;
2563  
2564      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2565          postcopy_discard_send_init(ms, block->idstr);
2566  
2567          /*
2568           * Deal with TPS != HPS and huge pages.  It discards any partially
2569           * sent host-page size chunks and marks any partially dirty host-page
2570           * size chunks as all dirty.  In this case the host-page is the host-page
2571           * for the particular RAMBlock, i.e. it might be a huge page.
2572           */
2573          postcopy_chunk_hostpages_pass(ms, block);
2574  
2575          /*
2576           * Postcopy sends chunks of bitmap over the wire, but it
2577           * just needs indexes at this point, avoids it having
2578           * target page specific code.
2579           */
2580          postcopy_send_discard_bm_ram(ms, block);
2581          postcopy_discard_send_finish(ms);
2582      }
2583  }
2584  
2585  /**
2586   * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2587   *
2588   * Helper for postcopy_chunk_hostpages; it's called twice to
2589   * canonicalize the two bitmaps, which are similar, but one is
2590   * inverted.
2591   *
2592   * Postcopy requires that all target pages in a hostpage are dirty or
2593   * clean, not a mix.  This function canonicalizes the bitmaps.
2594   *
2595   * @ms: current migration state
2596   * @block: block that contains the page we want to canonicalize
2597   */
2598  static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2599  {
2600      RAMState *rs = ram_state;
2601      unsigned long *bitmap = block->bmap;
2602      unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2603      unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2604      unsigned long run_start;
2605  
2606      if (block->page_size == TARGET_PAGE_SIZE) {
2607          /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2608          return;
2609      }
2610  
2611      /* Find a dirty page */
2612      run_start = find_next_bit(bitmap, pages, 0);
2613  
2614      while (run_start < pages) {
2615  
2616          /*
2617           * If the start of this run of pages is in the middle of a host
2618           * page, then we need to fixup this host page.
2619           */
2620          if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2621              /* Find the end of this run */
2622              run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2623              /*
2624               * If the end isn't at the start of a host page, then the
2625               * run doesn't finish at the end of a host page
2626               * and we need to discard.
2627               */
2628          }
2629  
2630          if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2631              unsigned long page;
2632              unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2633                                                               host_ratio);
2634              run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2635  
2636              /* Clean up the bitmap */
2637              for (page = fixup_start_addr;
2638                   page < fixup_start_addr + host_ratio; page++) {
2639                  /*
2640                   * Remark them as dirty, updating the count for any pages
2641                   * that weren't previously dirty.
2642                   */
2643                  rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2644              }
2645          }
2646  
2647          /* Find the next dirty page for the next iteration */
2648          run_start = find_next_bit(bitmap, pages, run_start);
2649      }
2650  }
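
/*
 * Worked example (illustrative, assuming 2 MiB host pages and 4 KiB target
 * pages, i.e. host_ratio = 512): if a dirty run starts at target page 700,
 * which is not aligned to 512, the fixup loop above re-marks the whole
 * containing host page (target pages 512..1023) as dirty and advances
 * run_start to 1024, so that host page will be discarded and resent as a
 * whole instead of being left partially sent.
 */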
2651  
2652  /**
2653   * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2654   *
2655   * Transmit the set of pages to be discarded after precopy to the target;
2656   * these are pages that:
2657   *     a) Have been previously transmitted but are now dirty again
2658   *     b) Have never been transmitted; this ensures that
2659   *        any pages on the destination that have been mapped by background
2660   *        tasks get discarded (transparent huge pages is the specific concern)
2661   * Hopefully this is pretty sparse
2662   *
2663   * @ms: current migration state
2664   */
2665  void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2666  {
2667      RAMState *rs = ram_state;
2668  
2669      RCU_READ_LOCK_GUARD();
2670  
2671      /* This should be our last sync, the src is now paused */
2672      migration_bitmap_sync(rs, false);
2673  
2674      /* Easiest way to make sure we don't resume in the middle of a host-page */
2675      rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2676      rs->last_seen_block = NULL;
2677      rs->last_page = 0;
2678  
2679      postcopy_each_ram_send_discard(ms);
2680  
2681      trace_ram_postcopy_send_discard_bitmap();
2682  }
2683  
2684  /**
2685   * ram_discard_range: discard dirtied pages at the beginning of postcopy
2686   *
2687   * Returns zero on success
2688   *
2689   * @rbname: name of the RAMBlock of the request. NULL means the
2690   *          same as the last one.
2691   * @start: starting offset within the RAMBlock (bytes)
2692   * @length: length to discard (bytes)
2693   */
2694  int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2695  {
2696      trace_ram_discard_range(rbname, start, length);
2697  
2698      RCU_READ_LOCK_GUARD();
2699      RAMBlock *rb = qemu_ram_block_by_name(rbname);
2700  
2701      if (!rb) {
2702          error_report("ram_discard_range: Failed to find block '%s'", rbname);
2703          return -1;
2704      }
2705  
2706      /*
2707       * On source VM, we don't need to update the received bitmap since
2708       * we don't even have one.
2709       */
2710      if (rb->receivedmap) {
2711          bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2712                       length >> qemu_target_page_bits());
2713      }
2714  
2715      return ram_block_discard_range(rb, start, length);
2716  }
2717  
2718  /*
2719   * For every allocation, we will try not to crash the VM if the
2720   * allocation fails.
2721   */
2722  static int xbzrle_init(void)
2723  {
2724      Error *local_err = NULL;
2725  
2726      if (!migrate_xbzrle()) {
2727          return 0;
2728      }
2729  
2730      XBZRLE_cache_lock();
2731  
2732      XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2733      if (!XBZRLE.zero_target_page) {
2734          error_report("%s: Error allocating zero page", __func__);
2735          goto err_out;
2736      }
2737  
2738      XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2739                                TARGET_PAGE_SIZE, &local_err);
2740      if (!XBZRLE.cache) {
2741          error_report_err(local_err);
2742          goto free_zero_page;
2743      }
2744  
2745      XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2746      if (!XBZRLE.encoded_buf) {
2747          error_report("%s: Error allocating encoded_buf", __func__);
2748          goto free_cache;
2749      }
2750  
2751      XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2752      if (!XBZRLE.current_buf) {
2753          error_report("%s: Error allocating current_buf", __func__);
2754          goto free_encoded_buf;
2755      }
2756  
2757      /* We are all good */
2758      XBZRLE_cache_unlock();
2759      return 0;
2760  
2761  free_encoded_buf:
2762      g_free(XBZRLE.encoded_buf);
2763      XBZRLE.encoded_buf = NULL;
2764  free_cache:
2765      cache_fini(XBZRLE.cache);
2766      XBZRLE.cache = NULL;
2767  free_zero_page:
2768      g_free(XBZRLE.zero_target_page);
2769      XBZRLE.zero_target_page = NULL;
2770  err_out:
2771      XBZRLE_cache_unlock();
2772      return -ENOMEM;
2773  }
2774  
2775  static int ram_state_init(RAMState **rsp)
2776  {
2777      *rsp = g_try_new0(RAMState, 1);
2778  
2779      if (!*rsp) {
2780          error_report("%s: Init ramstate fail", __func__);
2781          return -1;
2782      }
2783  
2784      qemu_mutex_init(&(*rsp)->bitmap_mutex);
2785      qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2786      QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2787      (*rsp)->ram_bytes_total = ram_bytes_total();
2788  
2789      /*
2790       * Count the total number of pages used by ram blocks not including any
2791       * gaps due to alignment or unplugs.
2792       * This must match the initial values of the dirty bitmap.
2793       */
2794      (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2795      ram_state_reset(*rsp);
2796  
2797      return 0;
2798  }
2799  
2800  static void ram_list_init_bitmaps(void)
2801  {
2802      MigrationState *ms = migrate_get_current();
2803      RAMBlock *block;
2804      unsigned long pages;
2805      uint8_t shift;
2806  
2807      /* Skip setting bitmap if there is no RAM */
2808      if (ram_bytes_total()) {
2809          shift = ms->clear_bitmap_shift;
2810          if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2811              error_report("clear_bitmap_shift (%u) too big, using "
2812                           "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2813              shift = CLEAR_BITMAP_SHIFT_MAX;
2814          } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2815              error_report("clear_bitmap_shift (%u) too small, using "
2816                           "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2817              shift = CLEAR_BITMAP_SHIFT_MIN;
2818          }
2819  
2820          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2821              pages = block->max_length >> TARGET_PAGE_BITS;
2822              /*
2823               * The initial dirty bitmap for migration must be set with all
2824               * ones to make sure we'll migrate every guest RAM page to the
2825               * destination.
2826               * Here we set RAMBlock.bmap all to 1 because, when restarting
2827               * a migration after a failed one, ram_list.
2828               * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2829               * guest memory.
2830               */
2831              block->bmap = bitmap_new(pages);
2832              bitmap_set(block->bmap, 0, pages);
2833              if (migrate_mapped_ram()) {
2834                  block->file_bmap = bitmap_new(pages);
2835              }
2836              block->clear_bmap_shift = shift;
2837              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2838          }
2839      }
2840  }
2841  
2842  static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843  {
2844      unsigned long pages;
2845      RAMBlock *rb;
2846  
2847      RCU_READ_LOCK_GUARD();
2848  
2849      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850          pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851          rs->migration_dirty_pages -= pages;
2852      }
2853  }
2854  
2855  static void ram_init_bitmaps(RAMState *rs)
2856  {
2857      qemu_mutex_lock_ramlist();
2858  
2859      WITH_RCU_READ_LOCK_GUARD() {
2860          ram_list_init_bitmaps();
2861          /* We don't use dirty log with background snapshots */
2862          if (!migrate_background_snapshot()) {
2863              memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2864              migration_bitmap_sync_precopy(rs, false);
2865          }
2866      }
2867      qemu_mutex_unlock_ramlist();
2868  
2869      /*
2870       * After an eventual first bitmap sync, fixup the initial bitmap
2871       * containing all 1s to exclude any discarded pages from migration.
2872       */
2873      migration_bitmap_clear_discarded_pages(rs);
2874  }
2875  
2876  static int ram_init_all(RAMState **rsp)
2877  {
2878      if (ram_state_init(rsp)) {
2879          return -1;
2880      }
2881  
2882      if (xbzrle_init()) {
2883          ram_state_cleanup(rsp);
2884          return -1;
2885      }
2886  
2887      ram_init_bitmaps(*rsp);
2888  
2889      return 0;
2890  }
2891  
2892  static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2893  {
2894      RAMBlock *block;
2895      uint64_t pages = 0;
2896  
2897      /*
2898       * Postcopy is not using xbzrle/compression, so no need for that.
2899       * Also, since the source is already halted, we don't need to care
2900       * about dirty page logging either.
2901       */
2902  
2903      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2904          pages += bitmap_count_one(block->bmap,
2905                                    block->used_length >> TARGET_PAGE_BITS);
2906      }
2907  
2908      /* This may not be aligned with current bitmaps. Recalculate. */
2909      rs->migration_dirty_pages = pages;
2910  
2911      ram_state_reset(rs);
2912  
2913      /* Update RAMState cache of output QEMUFile */
2914      rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2915  
2916      trace_ram_state_resume_prepare(pages);
2917  }
2918  
2919  /*
2920   * This function clears bits of the free pages reported by the caller from the
2921   * migration dirty bitmap. @addr is the host address corresponding to the
2922   * start of the continuous guest free pages, and @len is the total bytes of
2923   * those pages.
2924   */
2925  void qemu_guest_free_page_hint(void *addr, size_t len)
2926  {
2927      RAMBlock *block;
2928      ram_addr_t offset;
2929      size_t used_len, start, npages;
2930  
2931      /* This function is currently expected to be used during live migration */
2932      if (!migration_is_setup_or_active()) {
2933          return;
2934      }
2935  
2936      for (; len > 0; len -= used_len, addr += used_len) {
2937          block = qemu_ram_block_from_host(addr, false, &offset);
2938          if (unlikely(!block || offset >= block->used_length)) {
2939              /*
2940               * The implementation might not support RAMBlock resize during
2941               * live migration, but it could happen in theory with future
2942               * updates. So we add a check here to capture that case.
2943               */
2944              error_report_once("%s unexpected error", __func__);
2945              return;
2946          }
2947  
2948          if (len <= block->used_length - offset) {
2949              used_len = len;
2950          } else {
2951              used_len = block->used_length - offset;
2952          }
2953  
2954          start = offset >> TARGET_PAGE_BITS;
2955          npages = used_len >> TARGET_PAGE_BITS;
2956  
2957          qemu_mutex_lock(&ram_state->bitmap_mutex);
2958          /*
2959           * The skipped free pages are equivalent to having been sent from clear_bmap's
2960           * perspective, so clear the bits from the memory region bitmap which
2961           * are initially set. Otherwise those skipped pages will be sent in
2962           * the next round after syncing from the memory region bitmap.
2963           */
2964          migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2965          ram_state->migration_dirty_pages -=
2966                        bitmap_count_one_with_offset(block->bmap, start, npages);
2967          bitmap_clear(block->bmap, start, npages);
2968          qemu_mutex_unlock(&ram_state->bitmap_mutex);
2969      }
2970  }
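
/*
 * Worked example (illustrative, assuming 4 KiB target pages): a hint whose
 * host address maps to offset 0x200000 within a block, with used_len of
 * 1 MiB, translates to start = 0x200000 >> 12 = 512 and npages = 256, so
 * bits 512..767 are cleared from block->bmap and migration_dirty_pages is
 * reduced by however many of them were still set.
 */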
2971  
2972  #define MAPPED_RAM_HDR_VERSION 1
2973  struct MappedRamHeader {
2974      uint32_t version;
2975      /*
2976       * The target's page size, so we know how many pages are in the
2977       * bitmap.
2978       */
2979      uint64_t page_size;
2980      /*
2981       * The offset in the migration file where the pages bitmap is
2982       * stored.
2983       */
2984      uint64_t bitmap_offset;
2985      /*
2986       * The offset in the migration file where the actual pages (data)
2987       * are stored.
2988       */
2989      uint64_t pages_offset;
2990  } QEMU_PACKED;
2991  typedef struct MappedRamHeader MappedRamHeader;
2992  
2993  static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block)
2994  {
2995      g_autofree MappedRamHeader *header = NULL;
2996      size_t header_size, bitmap_size;
2997      long num_pages;
2998  
2999      header = g_new0(MappedRamHeader, 1);
3000      header_size = sizeof(MappedRamHeader);
3001  
3002      num_pages = block->used_length >> TARGET_PAGE_BITS;
3003      bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
3004  
3005      /*
3006       * Save the file offsets of where the bitmap and the pages should
3007       * go as they are written at the end of migration and during the
3008       * iterative phase, respectively.
3009       */
3010      block->bitmap_offset = qemu_get_offset(file) + header_size;
3011      block->pages_offset = ROUND_UP(block->bitmap_offset +
3012                                     bitmap_size,
3013                                     MAPPED_RAM_FILE_OFFSET_ALIGNMENT);
3014  
3015      header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION);
3016      header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
3017      header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
3018      header->pages_offset = cpu_to_be64(block->pages_offset);
3019  
3020      qemu_put_buffer(file, (uint8_t *) header, header_size);
3021  
3022      /* prepare offset for next ramblock */
3023      qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET);
3024  }
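
/*
 * Illustrative layout of the mapped-ram file region reserved per RAMBlock
 * by the function above:
 *
 *     qemu_get_offset(file)
 *     v
 *     +--------+--------------+-- pad --+---------------------------+
 *     | header | dirty bitmap |         | page data (used_length)   |
 *     +--------+--------------+---------+---------------------------+
 *              ^                        ^
 *              bitmap_offset            pages_offset (rounded up to
 *                                       MAPPED_RAM_FILE_OFFSET_ALIGNMENT)
 *
 * For example, a block with used_length = 4 GiB and 4 KiB target pages has
 * num_pages = 1 Mi and bitmap_size = 128 KiB.  The file offset is then
 * advanced past the page data so the next RAMBlock's header starts after
 * this block's region.
 */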
3025  
3026  static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header,
3027                                     Error **errp)
3028  {
3029      size_t ret, header_size = sizeof(MappedRamHeader);
3030  
3031      ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
3032      if (ret != header_size) {
3033          error_setg(errp, "Could not read whole mapped-ram migration header "
3034                     "(expected %zd, got %zd bytes)", header_size, ret);
3035          return false;
3036      }
3037  
3038      /* migration stream is big-endian */
3039      header->version = be32_to_cpu(header->version);
3040  
3041      if (header->version > MAPPED_RAM_HDR_VERSION) {
3042          error_setg(errp, "Migration mapped-ram capability version not "
3043                     "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION,
3044                     header->version);
3045          return false;
3046      }
3047  
3048      header->page_size = be64_to_cpu(header->page_size);
3049      header->bitmap_offset = be64_to_cpu(header->bitmap_offset);
3050      header->pages_offset = be64_to_cpu(header->pages_offset);
3051  
3052      return true;
3053  }
3054  
3055  /*
3056   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3057   * a long-running RCU critical section.  When RCU reclaims in the code
3058   * start to become numerous it will be necessary to reduce the
3059   * granularity of these critical sections.
3060   */
3061  
3062  /**
3063   * ram_save_setup: Setup RAM for migration
3064   *
3065   * Returns zero to indicate success and negative for error
3066   *
3067   * @f: QEMUFile where to send the data
3068   * @opaque: RAMState pointer
3069   */
3070  static int ram_save_setup(QEMUFile *f, void *opaque)
3071  {
3072      RAMState **rsp = opaque;
3073      RAMBlock *block;
3074      int ret, max_hg_page_size;
3075  
3076      if (compress_threads_save_setup()) {
3077          return -1;
3078      }
3079  
3080      /* migration has already set up the bitmap, reuse it. */
3081      if (!migration_in_colo_state()) {
3082          if (ram_init_all(rsp) != 0) {
3083              compress_threads_save_cleanup();
3084              return -1;
3085          }
3086      }
3087      (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3088  
3089      /*
3090       * ??? Mirrors the previous value of qemu_host_page_size,
3091       * but is this really what was intended for the migration?
3092       */
3093      max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
3094  
3095      WITH_RCU_READ_LOCK_GUARD() {
3096          qemu_put_be64(f, ram_bytes_total_with_ignored()
3097                           | RAM_SAVE_FLAG_MEM_SIZE);
3098  
3099          RAMBLOCK_FOREACH_MIGRATABLE(block) {
3100              qemu_put_byte(f, strlen(block->idstr));
3101              qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3102              qemu_put_be64(f, block->used_length);
3103              if (migrate_postcopy_ram() &&
3104                  block->page_size != max_hg_page_size) {
3105                  qemu_put_be64(f, block->page_size);
3106              }
3107              if (migrate_ignore_shared()) {
3108                  qemu_put_be64(f, block->mr->addr);
3109              }
3110  
3111              if (migrate_mapped_ram()) {
3112                  mapped_ram_setup_ramblock(f, block);
3113              }
3114          }
3115      }
3116  
3117      ret = rdma_registration_start(f, RAM_CONTROL_SETUP);
3118      if (ret < 0) {
3119          qemu_file_set_error(f, ret);
3120          return ret;
3121      }
3122  
3123      ret = rdma_registration_stop(f, RAM_CONTROL_SETUP);
3124      if (ret < 0) {
3125          qemu_file_set_error(f, ret);
3126          return ret;
3127      }
3128  
3129      migration_ops = g_malloc0(sizeof(MigrationOps));
3130  
3131      if (migrate_multifd()) {
3132          migration_ops->ram_save_target_page = ram_save_target_page_multifd;
3133      } else {
3134          migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3135      }
3136  
3137      bql_unlock();
3138      ret = multifd_send_sync_main();
3139      bql_lock();
3140      if (ret < 0) {
3141          return ret;
3142      }
3143  
3144      if (migrate_multifd() && !migrate_multifd_flush_after_each_section()
3145          && !migrate_mapped_ram()) {
3146          qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3147      }
3148  
3149      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3150      return qemu_fflush(f);
3151  }
3152  
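/*
 * Write each migratable RAMBlock's file bitmap into the mapped-ram
 * file at the bitmap_offset reserved for it during setup, then free
 * the bitmap so any late write from a multifd channel is caught.
 */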
3153  static void ram_save_file_bmap(QEMUFile *f)
3154  {
3155      RAMBlock *block;
3156  
3157      RAMBLOCK_FOREACH_MIGRATABLE(block) {
3158          long num_pages = block->used_length >> TARGET_PAGE_BITS;
3159          long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
3160  
3161          qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size,
3162                             block->bitmap_offset);
3163          ram_transferred_add(bitmap_size);
3164  
3165          /*
3166           * Free the bitmap here to catch any synchronization issues
3167           * with multifd channels. No channels should be sending pages
3168           * after we've written the bitmap to file.
3169           */
3170          g_free(block->file_bmap);
3171          block->file_bmap = NULL;
3172      }
3173  }
3174  
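/*
 * Atomically set or clear the bit for the page at @offset in @block's
 * file bitmap, so concurrent sender threads can update it safely.
 */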
3175  void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set)
3176  {
3177      if (set) {
3178          set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
3179      } else {
3180          clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
3181      }
3182  }
3183  
3184  /**
3185   * ram_save_iterate: iterative stage for migration
3186   *
3187   * Returns zero to indicate success and negative for error
3188   *
3189   * @f: QEMUFile where to send the data
3190   * @opaque: RAMState pointer
3191   */
3192  static int ram_save_iterate(QEMUFile *f, void *opaque)
3193  {
3194      RAMState **temp = opaque;
3195      RAMState *rs = *temp;
3196      int ret = 0;
3197      int i;
3198      int64_t t0;
3199      int done = 0;
3200  
3201      if (blk_mig_bulk_active()) {
3202          /* Avoid transferring ram during bulk phase of block migration as
3203           * the bulk phase will usually take a long time and transferring
3204           * ram updates during that time is pointless. */
3205          goto out;
3206      }
3207  
3208      /*
3209       * We'll hold this lock for a while, but that's okay for two reasons.
3210       * Firstly, the only other thread that may take it is the one calling
3211       * qemu_guest_free_page_hint(), which should be rare; secondly, see
3212       * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3213       * guarantees that we'll at least release it on a regular basis.
3214       */
3215      WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
3216          WITH_RCU_READ_LOCK_GUARD() {
3217              if (ram_list.version != rs->last_version) {
3218                  ram_state_reset(rs);
3219              }
3220  
3221              /* Read version before ram_list.blocks */
3222              smp_rmb();
3223  
3224              ret = rdma_registration_start(f, RAM_CONTROL_ROUND);
3225              if (ret < 0) {
3226                  qemu_file_set_error(f, ret);
3227                  goto out;
3228              }
3229  
3230              t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3231              i = 0;
3232              while ((ret = migration_rate_exceeded(f)) == 0 ||
3233                     postcopy_has_request(rs)) {
3234                  int pages;
3235  
3236                  if (qemu_file_get_error(f)) {
3237                      break;
3238                  }
3239  
3240                  pages = ram_find_and_save_block(rs);
3241                  /* no more pages to send */
3242                  if (pages == 0) {
3243                      done = 1;
3244                      break;
3245                  }
3246  
3247                  if (pages < 0) {
3248                      qemu_file_set_error(f, pages);
3249                      break;
3250                  }
3251  
3252                  rs->target_page_count += pages;
3253  
3254                  /*
3255                   * During postcopy, it is necessary to make sure one whole host
3256                   * page is sent in one chunk.
3257                   */
3258                  if (migrate_postcopy_ram()) {
3259                      compress_flush_data();
3260                  }
3261  
3262                  /*
3263                   * We want to check in the 1st loop, just in case it was the 1st
3264                   * time and we had to sync the dirty bitmap.
3265                   * qemu_clock_get_ns() is a bit expensive, so we only check once
3266                   * every few iterations.
3267                   */
3268                  if ((i & 63) == 0) {
3269                      uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3270                          1000000;
3271                      if (t1 > MAX_WAIT) {
3272                          trace_ram_save_iterate_big_wait(t1, i);
3273                          break;
3274                      }
3275                  }
3276                  i++;
3277              }
3278          }
3279      }
3280  
3281      /*
3282       * Must occur before EOS (or any QEMUFile operation)
3283       * because of RDMA protocol.
3284       */
3285      ret = rdma_registration_stop(f, RAM_CONTROL_ROUND);
3286      if (ret < 0) {
3287          qemu_file_set_error(f, ret);
3288      }
3289  
3290  out:
3291      if (ret >= 0
3292          && migration_is_setup_or_active()) {
3293          if (migrate_multifd() && migrate_multifd_flush_after_each_section() &&
3294              !migrate_mapped_ram()) {
3295              ret = multifd_send_sync_main();
3296              if (ret < 0) {
3297                  return ret;
3298              }
3299          }
3300  
3301          qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3302          ram_transferred_add(8);
3303          ret = qemu_fflush(f);
3304      }
3305      if (ret < 0) {
3306          return ret;
3307      }
3308  
3309      return done;
3310  }
3311  
3312  /**
3313   * ram_save_complete: function called to send the remaining amount of ram
3314   *
3315   * Returns zero to indicate success or negative on error
3316   *
3317   * Called with the BQL
3318   *
3319   * @f: QEMUFile where to send the data
3320   * @opaque: RAMState pointer
3321   */
3322  static int ram_save_complete(QEMUFile *f, void *opaque)
3323  {
3324      RAMState **temp = opaque;
3325      RAMState *rs = *temp;
3326      int ret = 0;
3327  
3328      rs->last_stage = !migration_in_colo_state();
3329  
3330      WITH_RCU_READ_LOCK_GUARD() {
3331          if (!migration_in_postcopy()) {
3332              migration_bitmap_sync_precopy(rs, true);
3333          }
3334  
3335          ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
3336          if (ret < 0) {
3337              qemu_file_set_error(f, ret);
3338              return ret;
3339          }
3340  
3341          /* try transferring iterative blocks of memory */
3342  
3343          /* flush all remaining blocks regardless of rate limiting */
3344          qemu_mutex_lock(&rs->bitmap_mutex);
3345          while (true) {
3346              int pages;
3347  
3348              pages = ram_find_and_save_block(rs);
3349              /* no more blocks to send */
3350              if (pages == 0) {
3351                  break;
3352              }
3353              if (pages < 0) {
3354                  qemu_mutex_unlock(&rs->bitmap_mutex);
3355                  return pages;
3356              }
3357          }
3358          qemu_mutex_unlock(&rs->bitmap_mutex);
3359  
3360          compress_flush_data();
3361  
3362          ret = rdma_registration_stop(f, RAM_CONTROL_FINISH);
3363          if (ret < 0) {
3364              qemu_file_set_error(f, ret);
3365              return ret;
3366          }
3367      }
3368  
3369      ret = multifd_send_sync_main();
3370      if (ret < 0) {
3371          return ret;
3372      }
3373  
3374      if (migrate_mapped_ram()) {
3375          ram_save_file_bmap(f);
3376  
3377          if (qemu_file_get_error(f)) {
3378              Error *local_err = NULL;
3379              int err = qemu_file_get_error_obj(f, &local_err);
3380  
3381              error_reportf_err(local_err, "Failed to write bitmap to file: ");
3382              return -err;
3383          }
3384      }
3385  
3386      if (migrate_multifd() && !migrate_multifd_flush_after_each_section() &&
3387          !migrate_mapped_ram()) {
3388          qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3389      }
3390      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3391      return qemu_fflush(f);
3392  }
3393  
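/*
 * ram_state_pending_estimate: cheap estimate of the remaining data,
 * computed from the current dirty page count without re-syncing the
 * dirty bitmap.
 */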
3394  static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3395                                         uint64_t *can_postcopy)
3396  {
3397      RAMState **temp = opaque;
3398      RAMState *rs = *temp;
3399  
3400      uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3401  
3402      if (migrate_postcopy_ram()) {
3403          /* We can do postcopy, and all the data is postcopiable */
3404          *can_postcopy += remaining_size;
3405      } else {
3406          *must_precopy += remaining_size;
3407      }
3408  }
3409  
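/*
 * ram_state_pending_exact: like the estimate above, but when not in
 * postcopy first re-sync the dirty bitmap (under the BQL and RCU) so
 * the result reflects the latest dirty state.
 */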
3410  static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3411                                      uint64_t *can_postcopy)
3412  {
3413      RAMState **temp = opaque;
3414      RAMState *rs = *temp;
3415      uint64_t remaining_size;
3416  
3417      if (!migration_in_postcopy()) {
3418          bql_lock();
3419          WITH_RCU_READ_LOCK_GUARD() {
3420              migration_bitmap_sync_precopy(rs, false);
3421          }
3422          bql_unlock();
3423      }
3424  
3425      remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3426  
3427      if (migrate_postcopy_ram()) {
3428          /* We can do postcopy, and all the data is postcopiable */
3429          *can_postcopy += remaining_size;
3430      } else {
3431          *must_precopy += remaining_size;
3432      }
3433  }
3434  
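/*
 * load_xbzrle: read one XBZRLE-encoded page from the stream and
 * decode it on top of the existing contents of @host.
 *
 * Returns 0 on success, -1 on a malformed header or decode failure.
 */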
3435  static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3436  {
3437      unsigned int xh_len;
3438      int xh_flags;
3439      uint8_t *loaded_data;
3440  
3441      /* extract RLE header */
3442      xh_flags = qemu_get_byte(f);
3443      xh_len = qemu_get_be16(f);
3444  
3445      if (xh_flags != ENCODING_FLAG_XBZRLE) {
3446          error_report("Failed to load XBZRLE page - wrong compression!");
3447          return -1;
3448      }
3449  
3450      if (xh_len > TARGET_PAGE_SIZE) {
3451          error_report("Failed to load XBZRLE page - len overflow!");
3452          return -1;
3453      }
3454      loaded_data = XBZRLE.decoded_buf;
3455      /* load data and decode */
3456      /* it can change loaded_data to point to an internal buffer */
3457      qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3458  
3459      /* decode RLE */
3460      if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3461                               TARGET_PAGE_SIZE) == -1) {
3462          error_report("Failed to load XBZRLE page - decode error!");
3463          return -1;
3464      }
3465  
3466      return 0;
3467  }
3468  
3469  /**
3470   * ram_block_from_stream: read a RAMBlock id from the migration stream
3471   *
3472   * Must be called from within a rcu critical section.
3473   *
3474   * Returns a pointer from within the RCU-protected ram_list.
3475   *
3476   * @mis: the migration incoming state pointer
3477   * @f: QEMUFile where to read the data from
3478   * @flags: Page flags (mostly to see if it's a continuation of previous block)
3479   * @channel: the channel we're using
3480   */
3481  static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3482                                                QEMUFile *f, int flags,
3483                                                int channel)
3484  {
3485      RAMBlock *block = mis->last_recv_block[channel];
3486      char id[256];
3487      uint8_t len;
3488  
3489      if (flags & RAM_SAVE_FLAG_CONTINUE) {
3490          if (!block) {
3491              error_report("Ack, bad migration stream!");
3492              return NULL;
3493          }
3494          return block;
3495      }
3496  
3497      len = qemu_get_byte(f);
3498      qemu_get_buffer(f, (uint8_t *)id, len);
3499      id[len] = 0;
3500  
3501      block = qemu_ram_block_by_name(id);
3502      if (!block) {
3503          error_report("Can't find block %s", id);
3504          return NULL;
3505      }
3506  
3507      if (migrate_ram_is_ignored(block)) {
3508          error_report("block %s should not be migrated!", id);
3509          return NULL;
3510      }
3511  
3512      mis->last_recv_block[channel] = block;
3513  
3514      return block;
3515  }
3516  
3517  static inline void *host_from_ram_block_offset(RAMBlock *block,
3518                                                 ram_addr_t offset)
3519  {
3520      if (!offset_in_ramblock(block, offset)) {
3521          return NULL;
3522      }
3523  
3524      return block->host + offset;
3525  }
3526  
3527  static void *host_page_from_ram_block_offset(RAMBlock *block,
3528                                               ram_addr_t offset)
3529  {
3530      /* Note: Explicitly no check against offset_in_ramblock(). */
3531      return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3532                                     block->page_size);
3533  }
3534  
3535  static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3536                                                           ram_addr_t offset)
3537  {
3538      return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3539  }
3540  
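/*
 * colo_record_bitmap: mark @pages offsets from @normal as dirty in
 * @block's bitmap, counting only the bits that were not already set
 * into migration_dirty_pages.  Protected by the bitmap mutex.
 */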
3541  void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3542  {
3543      qemu_mutex_lock(&ram_state->bitmap_mutex);
3544      for (int i = 0; i < pages; i++) {
3545          ram_addr_t offset = normal[i];
3546          ram_state->migration_dirty_pages += !test_and_set_bit(
3547                                                  offset >> TARGET_PAGE_BITS,
3548                                                  block->bmap);
3549      }
3550      qemu_mutex_unlock(&ram_state->bitmap_mutex);
3551  }
3552  
3553  static inline void *colo_cache_from_block_offset(RAMBlock *block,
3554                               ram_addr_t offset, bool record_bitmap)
3555  {
3556      if (!offset_in_ramblock(block, offset)) {
3557          return NULL;
3558      }
3559      if (!block->colo_cache) {
3560          error_report("%s: colo_cache is NULL in block: %s",
3561                       __func__, block->idstr);
3562          return NULL;
3563      }
3564  
3565      /*
3566       * During a COLO checkpoint, we need the bitmap of these migrated pages.
3567       * It helps us decide which pages in the RAM cache should be flushed
3568       * into the VM's RAM later.
3569       */
3570      if (record_bitmap) {
3571          colo_record_bitmap(block, &offset, 1);
3572      }
3573      return block->colo_cache + offset;
3574  }
3575  
3576  /**
3577   * ram_handle_zero: handle the zero page case
3578   *
3579   * If a page (or a whole RDMA chunk) has been
3580   * determined to be zero, then zap it.
3581   *
3582   * @host: host address for the zero page
3584   * @size: size of the zero page
3585   */
3586  void ram_handle_zero(void *host, uint64_t size)
3587  {
3588      if (!buffer_is_zero(host, size)) {
3589          memset(host, 0, size);
3590      }
3591  }
3592  
3593  static void colo_init_ram_state(void)
3594  {
3595      ram_state_init(&ram_state);
3596  }
3597  
3598  /*
3599   * colo cache: this is for the secondary VM.  We cache the whole
3600   * memory of the secondary VM; it is necessary to hold the global lock
3601   * to call this helper.
3602   */
3603  int colo_init_ram_cache(void)
3604  {
3605      RAMBlock *block;
3606  
3607      WITH_RCU_READ_LOCK_GUARD() {
3608          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3609              block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3610                                                      NULL, false, false);
3611              if (!block->colo_cache) {
3612                  error_report("%s: Can't alloc memory for COLO cache of block %s, "
3613                               "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3614                               block->used_length);
3615                  RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3616                      if (block->colo_cache) {
3617                          qemu_anon_ram_free(block->colo_cache, block->used_length);
3618                          block->colo_cache = NULL;
3619                      }
3620                  }
3621                  return -errno;
3622              }
3623              if (!machine_dump_guest_core(current_machine)) {
3624                  qemu_madvise(block->colo_cache, block->used_length,
3625                               QEMU_MADV_DONTDUMP);
3626              }
3627          }
3628      }
3629  
3630      /*
3631       * Record the dirty pages that were sent by the PVM.  We use this dirty
3632       * bitmap to decide which pages in the cache should be flushed into the
3633       * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3634       */
3635      if (ram_bytes_total()) {
3636          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3637              unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3638              block->bmap = bitmap_new(pages);
3639          }
3640      }
3641  
3642      colo_init_ram_state();
3643      return 0;
3644  }
3645  
3646  /* TODO: duplicated with ram_init_bitmaps */
3647  void colo_incoming_start_dirty_log(void)
3648  {
3649      RAMBlock *block = NULL;
3650      /* For memory_global_dirty_log_start below. */
3651      bql_lock();
3652      qemu_mutex_lock_ramlist();
3653  
3654      memory_global_dirty_log_sync(false);
3655      WITH_RCU_READ_LOCK_GUARD() {
3656          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3657              ramblock_sync_dirty_bitmap(ram_state, block);
3658              /* Discard this dirty bitmap record */
3659              bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3660          }
3661          memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3662      }
3663      ram_state->migration_dirty_pages = 0;
3664      qemu_mutex_unlock_ramlist();
3665      bql_unlock();
3666  }
3667  
3668  /* It is necessary to hold the global lock to call this helper */
3669  void colo_release_ram_cache(void)
3670  {
3671      RAMBlock *block;
3672  
3673      memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3674      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3675          g_free(block->bmap);
3676          block->bmap = NULL;
3677      }
3678  
3679      WITH_RCU_READ_LOCK_GUARD() {
3680          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3681              if (block->colo_cache) {
3682                  qemu_anon_ram_free(block->colo_cache, block->used_length);
3683                  block->colo_cache = NULL;
3684              }
3685          }
3686      }
3687      ram_state_cleanup(&ram_state);
3688  }
3689  
3690  /**
3691   * ram_load_setup: Setup RAM for migration incoming side
3692   *
3693   * Returns zero to indicate success and negative for error
3694   *
3695   * @f: QEMUFile where to receive the data
3696   * @opaque: RAMState pointer
3697   */
3698  static int ram_load_setup(QEMUFile *f, void *opaque)
3699  {
3700      xbzrle_load_setup();
3701      ramblock_recv_map_init();
3702  
3703      return 0;
3704  }
3705  
3706  static int ram_load_cleanup(void *opaque)
3707  {
3708      RAMBlock *rb;
3709  
3710      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3711          qemu_ram_block_writeback(rb);
3712      }
3713  
3714      xbzrle_load_cleanup();
3715  
3716      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3717          g_free(rb->receivedmap);
3718          rb->receivedmap = NULL;
3719      }
3720  
3721      return 0;
3722  }
3723  
3724  /**
3725   * ram_postcopy_incoming_init: allocate postcopy data structures
3726   *
3727   * Returns 0 for success and negative if there was one error
3728   * Returns 0 for success and negative if there was an error
3729   * @mis: current migration incoming state
3730   *
3731   * Allocate data structures etc needed by incoming migration with
3732   * postcopy-ram. postcopy-ram's similarly named
3733   * postcopy_ram_incoming_init does the work.
3734   */
3735  int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3736  {
3737      return postcopy_ram_incoming_init(mis);
3738  }
3739  
3740  /**
3741   * ram_load_postcopy: load a page in postcopy case
3742   *
3743   * Returns 0 for success or -errno in case of error
3744   *
3745   * Called in postcopy mode by ram_load().
3746   * rcu_read_lock is taken prior to this being called.
3747   *
3748   * @f: QEMUFile where to receive the data
3749   * @channel: the channel to use for loading
3750   */
3751  int ram_load_postcopy(QEMUFile *f, int channel)
3752  {
3753      int flags = 0, ret = 0;
3754      bool place_needed = false;
3755      bool matches_target_page_size = false;
3756      MigrationIncomingState *mis = migration_incoming_get_current();
3757      PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3758  
3759      while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3760          ram_addr_t addr;
3761          void *page_buffer = NULL;
3762          void *place_source = NULL;
3763          RAMBlock *block = NULL;
3764          uint8_t ch;
3765          int len;
3766  
3767          addr = qemu_get_be64(f);
3768  
3769          /*
3770           * If qemu file error, we should stop here, and then "addr"
3771           * may be invalid
3772           */
3773          ret = qemu_file_get_error(f);
3774          if (ret) {
3775              break;
3776          }
3777  
3778          flags = addr & ~TARGET_PAGE_MASK;
3779          addr &= TARGET_PAGE_MASK;
3780  
3781          trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3782          if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3783                       RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3784              block = ram_block_from_stream(mis, f, flags, channel);
3785              if (!block) {
3786                  ret = -EINVAL;
3787                  break;
3788              }
3789  
3790              /*
3791               * Relying on used_length is racy and can result in false positives.
3792               * We might place pages beyond used_length in case RAM was shrunk
3793               * while in postcopy, which is fine - trying to place via
3794               * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3795               */
3796              if (!block->host || addr >= block->postcopy_length) {
3797                  error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3798                  ret = -EINVAL;
3799                  break;
3800              }
3801              tmp_page->target_pages++;
3802              matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3803              /*
3804               * Postcopy requires that we place whole host pages atomically;
3805               * these may be huge pages for RAMBlocks that are backed by
3806               * hugetlbfs.
3807               * To make it atomic, the data is read into a temporary page
3808               * that's moved into place later.
3809               * The migration protocol uses possibly smaller target pages;
3810               * however, the source ensures it always sends all the components
3811               * of a host page in one chunk.
3812               */
3813              page_buffer = tmp_page->tmp_huge_page +
3814                            host_page_offset_from_ram_block_offset(block, addr);
3815              /* If all TPs are zero then we can optimise the placement */
3816              if (tmp_page->target_pages == 1) {
3817                  tmp_page->host_addr =
3818                      host_page_from_ram_block_offset(block, addr);
3819              } else if (tmp_page->host_addr !=
3820                         host_page_from_ram_block_offset(block, addr)) {
3821                  /* not the 1st TP within the HP */
3822                  error_report("Non-same host page detected on channel %d: "
3823                               "Target host page %p, received host page %p "
3824                               "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3825                               channel, tmp_page->host_addr,
3826                               host_page_from_ram_block_offset(block, addr),
3827                               block->idstr, addr, tmp_page->target_pages);
3828                  ret = -EINVAL;
3829                  break;
3830              }
3831  
3832              /*
3833               * If it's the last part of a host page then we place the host
3834               * page
3835               */
3836              if (tmp_page->target_pages ==
3837                  (block->page_size / TARGET_PAGE_SIZE)) {
3838                  place_needed = true;
3839              }
3840              place_source = tmp_page->tmp_huge_page;
3841          }
3842  
3843          switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3844          case RAM_SAVE_FLAG_ZERO:
3845              ch = qemu_get_byte(f);
3846              if (ch != 0) {
3847                  error_report("Found a zero page with value %d", ch);
3848                  ret = -EINVAL;
3849                  break;
3850              }
3851              /*
3852               * We can skip setting page_buffer when this is a zero page
3853               * and (block->page_size == TARGET_PAGE_SIZE).
3854               */
3855              if (!matches_target_page_size) {
3856                  memset(page_buffer, ch, TARGET_PAGE_SIZE);
3857              }
3858              break;
3859  
3860          case RAM_SAVE_FLAG_PAGE:
3861              tmp_page->all_zero = false;
3862              if (!matches_target_page_size) {
3863                  /* For huge pages, we always use temporary buffer */
3864                  qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3865              } else {
3866                  /*
3867                   * For small pages that match the target page size, we
3868                   * avoid the qemu_file copy.  Instead we directly use
3869                   * the buffer of QEMUFile to place the page.  Note: we
3870                   * cannot do any QEMUFile operation before using that
3871                   * buffer to make sure the buffer is valid when
3872                   * placing the page.
3873                   */
3874                  qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3875                                           TARGET_PAGE_SIZE);
3876              }
3877              break;
3878          case RAM_SAVE_FLAG_COMPRESS_PAGE:
3879              tmp_page->all_zero = false;
3880              len = qemu_get_be32(f);
3881              if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3882                  error_report("Invalid compressed data length: %d", len);
3883                  ret = -EINVAL;
3884                  break;
3885              }
3886              decompress_data_with_multi_threads(f, page_buffer, len);
3887              break;
3888          case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3889              multifd_recv_sync_main();
3890              break;
3891          case RAM_SAVE_FLAG_EOS:
3892              /* normal exit */
3893              if (migrate_multifd() &&
3894                  migrate_multifd_flush_after_each_section()) {
3895                  multifd_recv_sync_main();
3896              }
3897              break;
3898          default:
3899              error_report("Unknown combination of migration flags: 0x%x"
3900                           " (postcopy mode)", flags);
3901              ret = -EINVAL;
3902              break;
3903          }
3904  
3905          /* Got the whole host page, wait for decompress before placing. */
3906          if (place_needed) {
3907              ret |= wait_for_decompress_done();
3908          }
3909  
3910          /* Detect for any possible file errors */
3911          /* Detect any possible file errors */
3912              ret = qemu_file_get_error(f);
3913          }
3914  
3915          if (!ret && place_needed) {
3916              if (tmp_page->all_zero) {
3917                  ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3918              } else {
3919                  ret = postcopy_place_page(mis, tmp_page->host_addr,
3920                                            place_source, block);
3921              }
3922              place_needed = false;
3923              postcopy_temp_page_reset(tmp_page);
3924          }
3925      }
3926  
3927      return ret;
3928  }
3929  
3930  static bool postcopy_is_running(void)
3931  {
3932      PostcopyState ps = postcopy_state_get();
3933      return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3934  }
3935  
3936  /*
3937   * Flush content of RAM cache into SVM's memory.
3938   * Only flush the pages that have been dirtied by the PVM or SVM, or both.
3939   */
3940  void colo_flush_ram_cache(void)
3941  {
3942      RAMBlock *block = NULL;
3943      void *dst_host;
3944      void *src_host;
3945      unsigned long offset = 0;
3946  
3947      memory_global_dirty_log_sync(false);
3948      qemu_mutex_lock(&ram_state->bitmap_mutex);
3949      WITH_RCU_READ_LOCK_GUARD() {
3950          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3951              ramblock_sync_dirty_bitmap(ram_state, block);
3952          }
3953      }
3954  
3955      trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3956      WITH_RCU_READ_LOCK_GUARD() {
3957          block = QLIST_FIRST_RCU(&ram_list.blocks);
3958  
3959          while (block) {
3960              unsigned long num = 0;
3961  
3962              offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3963              if (!offset_in_ramblock(block,
3964                                      ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3965                  offset = 0;
3966                  num = 0;
3967                  block = QLIST_NEXT_RCU(block, next);
3968              } else {
3969                  unsigned long i = 0;
3970  
3971                  for (i = 0; i < num; i++) {
3972                      migration_bitmap_clear_dirty(ram_state, block, offset + i);
3973                  }
3974                  dst_host = block->host
3975                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3976                  src_host = block->colo_cache
3977                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3978                  memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3979                  offset += num;
3980              }
3981          }
3982      }
3983      qemu_mutex_unlock(&ram_state->bitmap_mutex);
3984      trace_colo_flush_ram_cache_end();
3985  }
3986  
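/*
 * Queue one chunk of a mapped-ram file for the multifd receive
 * threads to read into @host_addr.  Returns @size on success and 0
 * if the request could not be submitted.
 */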
3987  static size_t ram_load_multifd_pages(void *host_addr, size_t size,
3988                                       uint64_t offset)
3989  {
3990      MultiFDRecvData *data = multifd_get_recv_data();
3991  
3992      data->opaque = host_addr;
3993      data->file_offset = offset;
3994      data->size = size;
3995  
3996      if (!multifd_recv()) {
3997          return 0;
3998      }
3999  
4000      return size;
4001  }
4002  
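/*
 * Walk @bitmap and, for every run of set bits (pages present in the
 * file), read the corresponding data back into guest memory, either
 * through the multifd threads or directly from @f.
 */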
4003  static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
4004                                       long num_pages, unsigned long *bitmap,
4005                                       Error **errp)
4006  {
4007      ERRP_GUARD();
4008      unsigned long set_bit_idx, clear_bit_idx;
4009      ram_addr_t offset;
4010      void *host;
4011      size_t read, unread, size;
4012  
4013      for (set_bit_idx = find_first_bit(bitmap, num_pages);
4014           set_bit_idx < num_pages;
4015           set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
4016  
4017          clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
4018  
4019          unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
4020          offset = set_bit_idx << TARGET_PAGE_BITS;
4021  
4022          while (unread > 0) {
4023              host = host_from_ram_block_offset(block, offset);
4024              if (!host) {
4025                  error_setg(errp, "page outside of ramblock %s range",
4026                             block->idstr);
4027                  return false;
4028              }
4029  
4030              size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);
4031  
4032              if (migrate_multifd()) {
4033                  read = ram_load_multifd_pages(host, size,
4034                                                block->pages_offset + offset);
4035              } else {
4036                  read = qemu_get_buffer_at(f, host, size,
4037                                            block->pages_offset + offset);
4038              }
4039  
4040              if (!read) {
4041                  goto err;
4042              }
4043              offset += read;
4044              unread -= read;
4045          }
4046      }
4047  
4048      return true;
4049  
4050  err:
4051      qemu_file_get_error_obj(f, errp);
4052      error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
4053                    "from file offset %" PRIx64 ": ", block->idstr, offset,
4054                    " from file offset %" PRIx64 ": ", block->idstr, offset,
4055      return false;
4056  }
4057  
4058  static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
4059                                        ram_addr_t length, Error **errp)
4060  {
4061      g_autofree unsigned long *bitmap = NULL;
4062      MappedRamHeader header;
4063      size_t bitmap_size;
4064      long num_pages;
4065  
4066      if (!mapped_ram_read_header(f, &header, errp)) {
4067          return;
4068      }
4069  
4070      block->pages_offset = header.pages_offset;
4071  
4072      /*
4073       * Check the alignment of the file region that contains pages. We
4074       * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
4075       * value to change in the future. Do only a sanity check with page
4076       * size alignment.
4077       */
4078      if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
4079          error_setg(errp,
4080                     "Error reading ramblock %s pages, region has bad alignment",
4081                     block->idstr);
4082          return;
4083      }
4084  
4085      num_pages = length / header.page_size;
4086      bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
4087  
4088      bitmap = g_malloc0(bitmap_size);
4089      if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
4090                             header.bitmap_offset) != bitmap_size) {
4091          error_setg(errp, "Error reading dirty bitmap");
4092          return;
4093      }
4094  
4095      if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
4096          return;
4097      }
4098  
4099      /* Skip pages array */
4100      qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
4101  
4102      return;
4103  }
4104  
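/*
 * parse_ramblock: handle one ramblock entry of the stream header:
 * resize the local block if the lengths differ and cross-check the
 * page size and GPA when the corresponding capabilities are enabled.
 * Mapped-ram streams are handed off to parse_ramblock_mapped_ram().
 */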
4105  static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
4106  {
4107      int ret = 0;
4108      /* ADVISE is earlier, it shows the source has the postcopy capability on */
4109      bool postcopy_advised = migration_incoming_postcopy_advised();
4110      int max_hg_page_size;
4111      Error *local_err = NULL;
4112  
4113      assert(block);
4114  
4115      if (migrate_mapped_ram()) {
4116          parse_ramblock_mapped_ram(f, block, length, &local_err);
4117          if (local_err) {
4118              error_report_err(local_err);
4119              return -EINVAL;
4120          }
4121          return 0;
4122      }
4123  
4124      if (!qemu_ram_is_migratable(block)) {
4125          error_report("block %s should not be migrated!", block->idstr);
4126          return -EINVAL;
4127      }
4128  
4129      if (length != block->used_length) {
4130          ret = qemu_ram_resize(block, length, &local_err);
4131          if (local_err) {
4132              error_report_err(local_err);
4133              return ret;
4134          }
4135      }
4136  
4137      /*
4138       * ??? Mirrors the previous value of qemu_host_page_size,
4139       * but is this really what was intended for the migration?
4140       */
4141      max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
4142  
4143      /* For postcopy we need to check hugepage sizes match */
4144      if (postcopy_advised && migrate_postcopy_ram() &&
4145          block->page_size != max_hg_page_size) {
4146          uint64_t remote_page_size = qemu_get_be64(f);
4147          if (remote_page_size != block->page_size) {
4148              error_report("Mismatched RAM page size %s "
4149                           "(local) %zd != %" PRId64, block->idstr,
4150                           block->page_size, remote_page_size);
4151              return -EINVAL;
4152          }
4153      }
4154      if (migrate_ignore_shared()) {
4155          hwaddr addr = qemu_get_be64(f);
4156          if (migrate_ram_is_ignored(block) &&
4157              block->mr->addr != addr) {
4158              error_report("Mismatched GPAs for block %s "
4159                           "%" PRId64 " != %" PRId64, block->idstr,
4160                           (uint64_t)addr, (uint64_t)block->mr->addr);
4161              return -EINVAL;
4162          }
4163      }
4164      ret = rdma_block_notification_handle(f, block->idstr);
4165      if (ret < 0) {
4166          qemu_file_set_error(f, ret);
4167      }
4168  
4169      return ret;
4170  }
4171  
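/*
 * parse_ramblocks: consume the RAM_SAVE_FLAG_MEM_SIZE record, looking
 * up each named ramblock and parsing its per-block data until
 * @total_ram_bytes worth of blocks have been processed.
 */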
4172  static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes)
4173  {
4174      int ret = 0;
4175  
4176      /* Synchronize RAM block list */
4177      while (!ret && total_ram_bytes) {
4178          RAMBlock *block;
4179          char id[256];
4180          ram_addr_t length;
4181          int len = qemu_get_byte(f);
4182  
4183          qemu_get_buffer(f, (uint8_t *)id, len);
4184          id[len] = 0;
4185          length = qemu_get_be64(f);
4186  
4187          block = qemu_ram_block_by_name(id);
4188          if (block) {
4189              ret = parse_ramblock(f, block, length);
4190          } else {
4191              error_report("Unknown ramblock \"%s\", cannot accept "
4192                           "migration", id);
4193              ret = -EINVAL;
4194          }
4195          total_ram_bytes -= length;
4196      }
4197  
4198      return ret;
4199  }
4200  
4201  /**
4202   * ram_load_precopy: load pages in precopy case
4203   *
4204   * Returns 0 for success or -errno in case of error
4205   *
4206   * Called in precopy mode by ram_load().
4207   * rcu_read_lock is taken prior to this being called.
4208   *
4209   * @f: QEMUFile where to receive the data
4210   */
4211  static int ram_load_precopy(QEMUFile *f)
4212  {
4213      MigrationIncomingState *mis = migration_incoming_get_current();
4214      int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4215  
4216      if (!migrate_compress()) {
4217          invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4218      }
4219  
4220      if (migrate_mapped_ram()) {
4221          invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
4222                            RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
4223                            RAM_SAVE_FLAG_ZERO);
4224      }
4225  
4226      while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4227          ram_addr_t addr;
4228          void *host = NULL, *host_bak = NULL;
4229          uint8_t ch;
4230  
4231          /*
4232           * Yield periodically to let the main loop run, but an iteration of
4233           * the main loop is expensive, so only do it every few iterations.
4234           */
4235          if ((i & 32767) == 0 && qemu_in_coroutine()) {
4236              aio_co_schedule(qemu_get_current_aio_context(),
4237                              qemu_coroutine_self());
4238              qemu_coroutine_yield();
4239          }
4240          i++;
4241  
4242          addr = qemu_get_be64(f);
4243          ret = qemu_file_get_error(f);
4244          if (ret) {
4245              error_report("Getting RAM address failed");
4246              break;
4247          }
4248  
4249          flags = addr & ~TARGET_PAGE_MASK;
4250          addr &= TARGET_PAGE_MASK;
4251  
4252          if (flags & invalid_flags) {
4253              error_report("Unexpected RAM flags: %d", flags & invalid_flags);
4254  
4255              if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4256                  error_report("Received an unexpected compressed page");
4257              }
4258  
4259              ret = -EINVAL;
4260              break;
4261          }
4262  
4263          if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4264                       RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4265              RAMBlock *block = ram_block_from_stream(mis, f, flags,
4266                                                      RAM_CHANNEL_PRECOPY);
4267  
4268              host = host_from_ram_block_offset(block, addr);
4269              /*
4270               * After going into the COLO stage, we should not load the page
4271               * into the SVM's memory directly; we put it into colo_cache first.
4272               * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4273               * Previously, we copied all this memory in the COLO preparing
4274               * stage while the VM had to be stopped, which is time-consuming.
4275               * Here we optimize it with a trick: back up every page during the
4276               * migration process while COLO is enabled.  Although this affects
4277               * the migration speed, it clearly reduces the downtime compared to
4278               * backing up all of the SVM's memory in the COLO preparing stage.
4279               */
4280              if (migration_incoming_colo_enabled()) {
4281                  if (migration_incoming_in_colo_state()) {
4282                      /* In COLO stage, put all pages into cache temporarily */
4283                      host = colo_cache_from_block_offset(block, addr, true);
4284                  } else {
4285                     /*
4286                      * In the migration stage but before the COLO stage,
4287                      * put all pages into both the cache and the SVM's memory.
4288                      */
4289                      host_bak = colo_cache_from_block_offset(block, addr, false);
4290                  }
4291              }
4292              if (!host) {
4293                  error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4294                  ret = -EINVAL;
4295                  break;
4296              }
4297              if (!migration_incoming_in_colo_state()) {
4298                  ramblock_recv_bitmap_set(block, host);
4299              }
4300  
4301              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4302          }
4303  
4304          switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4305          case RAM_SAVE_FLAG_MEM_SIZE:
4306              ret = parse_ramblocks(f, addr);
4307              /*
4308               * For mapped-ram migration (to a file) using multifd, we sync
4309               * once and for all here to make sure all tasks we queued to
4310               * multifd threads are completed, so that all the ramblocks
4311               * (including all the guest memory pages within) are fully
4312               * loaded after this sync returns.
4313               */
4314              if (migrate_mapped_ram()) {
4315                  multifd_recv_sync_main();
4316              }
4317              break;
4318  
4319          case RAM_SAVE_FLAG_ZERO:
4320              ch = qemu_get_byte(f);
4321              if (ch != 0) {
4322                  error_report("Found a zero page with value %d", ch);
4323                  ret = -EINVAL;
4324                  break;
4325              }
4326              ram_handle_zero(host, TARGET_PAGE_SIZE);
4327              break;
4328  
4329          case RAM_SAVE_FLAG_PAGE:
4330              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4331              break;
4332  
4333          case RAM_SAVE_FLAG_COMPRESS_PAGE:
4334              len = qemu_get_be32(f);
4335              if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4336                  error_report("Invalid compressed data length: %d", len);
4337                  ret = -EINVAL;
4338                  break;
4339              }
4340              decompress_data_with_multi_threads(f, host, len);
4341              break;
4342  
4343          case RAM_SAVE_FLAG_XBZRLE:
4344              if (load_xbzrle(f, addr, host) < 0) {
4345                  error_report("Failed to decompress XBZRLE page at "
4346                               RAM_ADDR_FMT, addr);
4347                  ret = -EINVAL;
4348                  break;
4349              }
4350              break;
4351          case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4352              multifd_recv_sync_main();
4353              break;
4354          case RAM_SAVE_FLAG_EOS:
4355              /* normal exit */
4356              if (migrate_multifd() &&
4357                  migrate_multifd_flush_after_each_section() &&
4358                  /*
4359                   * Mapped-ram migration flushes once and for all after
4360                   * parsing ramblocks. Always ignore EOS for it.
4361                   */
4362                  !migrate_mapped_ram()) {
4363                  multifd_recv_sync_main();
4364              }
4365              break;
4366          case RAM_SAVE_FLAG_HOOK:
4367              ret = rdma_registration_handle(f);
4368              if (ret < 0) {
4369                  qemu_file_set_error(f, ret);
4370              }
4371              break;
4372          default:
4373              error_report("Unknown combination of migration flags: 0x%x", flags);
4374              ret = -EINVAL;
4375          }
4376          if (!ret) {
4377              ret = qemu_file_get_error(f);
4378          }
4379          if (!ret && host_bak) {
4380              memcpy(host_bak, host, TARGET_PAGE_SIZE);
4381          }
4382      }
4383  
4384      ret |= wait_for_decompress_done();
4385      return ret;
4386  }
4387  
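/*
 * ram_load: entry point for loading incoming RAM state; dispatches to
 * the postcopy or precopy loader depending on the postcopy state.
 */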
4388  static int ram_load(QEMUFile *f, void *opaque, int version_id)
4389  {
4390      int ret = 0;
4391      static uint64_t seq_iter;
4392      /*
4393       * If system is running in postcopy mode, page inserts to host memory must
4394       * be atomic
4395       */
4396      bool postcopy_running = postcopy_is_running();
4397  
4398      seq_iter++;
4399  
4400      if (version_id != 4) {
4401          return -EINVAL;
4402      }
4403  
4404      /*
4405       * This RCU critical section can be very long running.
4406       * When RCU reclaims in the code start to become numerous,
4407       * it will be necessary to reduce the granularity of this
4408       * critical section.
4409       */
4410      WITH_RCU_READ_LOCK_GUARD() {
4411          if (postcopy_running) {
4412              /*
4413               * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4414               * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4415               * service fast page faults.
4416               */
4417              ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4418          } else {
4419              ret = ram_load_precopy(f);
4420          }
4421      }
4422      trace_ram_load_complete(ret, seq_iter);
4423  
4424      return ret;
4425  }
4426  
4427  static bool ram_has_postcopy(void *opaque)
4428  {
4429      RAMBlock *rb;
4430      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4431          if (ramblock_is_pmem(rb)) {
4432              info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4433                           "is not supported now!", rb->idstr, rb->host);
4434              return false;
4435          }
4436      }
4437  
4438      return migrate_postcopy_ram();
4439  }
4440  
4441  /* Sync all the dirty bitmaps with the destination VM.  */
4442  static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4443  {
4444      RAMBlock *block;
4445      QEMUFile *file = s->to_dst_file;
4446  
4447      trace_ram_dirty_bitmap_sync_start();
4448  
4449      qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
4450      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4451          qemu_savevm_send_recv_bitmap(file, block->idstr);
4452          trace_ram_dirty_bitmap_request(block->idstr);
4453          qatomic_inc(&rs->postcopy_bmap_sync_requested);
4454      }
4455  
4456      trace_ram_dirty_bitmap_sync_wait();
4457  
4458      /* Wait until all the ramblocks' dirty bitmaps are synced */
4459      while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
4460          if (migration_rp_wait(s)) {
4461              return -1;
4462          }
4463      }
4464  
4465      trace_ram_dirty_bitmap_sync_complete();
4466  
4467      return 0;
4468  }
4469  
4470  /*
4471   * Read the received bitmap, revert it as the initial dirty bitmap.
4472   * This is only used when the postcopy migration is paused but wants
4473   * to resume from a middle point.
4474   *
4475   * Returns true if succeeded, false for errors.
4476   */
4477  bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
4478  {
4479      /* from_dst_file is always valid because we're within rp_thread */
4480      QEMUFile *file = s->rp_state.from_dst_file;
4481      g_autofree unsigned long *le_bitmap = NULL;
4482      unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
4483      uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4484      uint64_t size, end_mark;
4485      RAMState *rs = ram_state;
4486  
4487      trace_ram_dirty_bitmap_reload_begin(block->idstr);
4488  
4489      if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4490          error_setg(errp, "Reload bitmap in incorrect state %s",
4491                     MigrationStatus_str(s->state));
4492          return false;
4493      }
4494  
4495      /*
4496       * Note: see comments in ramblock_recv_bitmap_send() on why we
4497       * need the endianness conversion, and the paddings.
4498       */
4499      local_size = ROUND_UP(local_size, 8);
4500  
4501      /* Add paddings */
4502      le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4503  
4504      size = qemu_get_be64(file);
4505  
4506      /* The size of the bitmap should match with our ramblock */
4507      if (size != local_size) {
4508          error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64
4509                     " != 0x%"PRIx64")", block->idstr, size, local_size);
4510          return false;
4511      }
4512  
4513      size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4514      end_mark = qemu_get_be64(file);
4515  
4516      if (qemu_file_get_error(file) || size != local_size) {
4517          error_setg(errp, "read bitmap failed for ramblock '%s': "
4518                     "(size 0x%"PRIx64", got: 0x%"PRIx64")",
4519                     block->idstr, local_size, size);
4520          return false;
4521      }
4522  
4523      if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4524          error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64,
4525                     block->idstr, end_mark);
4526          return false;
4527      }
4528  
4529      /*
4530       * Endianness conversion. We are during postcopy (though paused).
4531       * The dirty bitmap won't change. We can directly modify it.
4532       */
4533      bitmap_from_le(block->bmap, le_bitmap, nbits);
4534  
4535      /*
4536       * What we received is "received bitmap". Revert it as the initial
4537       * dirty bitmap for this ramblock.
4538       */
4539      bitmap_complement(block->bmap, block->bmap, nbits);
4540  
4541      /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4542      ramblock_dirty_bitmap_clear_discarded_pages(block);
4543  
4544      /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4545      trace_ram_dirty_bitmap_reload_complete(block->idstr);
4546  
4547      qatomic_dec(&rs->postcopy_bmap_sync_requested);
4548  
4549      /*
4550       * We succeeded in syncing the bitmap for the current ramblock.  Always
4551       * kick the migration thread to check whether all requested bitmaps are
4552       * reloaded.  NOTE: it's racy to only kick when requested==0, because
4553       * we don't know whether the migration thread may still be increasing
4554       * it.
4555       */
4556      migration_rp_kick(s);
4557  
4558      return true;
4559  }
4560  
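/*
 * ram_resume_prepare: when resuming a paused postcopy migration,
 * re-sync the dirty bitmaps with the destination and then prepare
 * RAMState for the resumed iteration.
 */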
4561  static int ram_resume_prepare(MigrationState *s, void *opaque)
4562  {
4563      RAMState *rs = *(RAMState **)opaque;
4564      int ret;
4565  
4566      ret = ram_dirty_bitmap_sync_all(s, rs);
4567      if (ret) {
4568          return ret;
4569      }
4570  
4571      ram_state_resume_prepare(rs, s->to_dst_file);
4572  
4573      return 0;
4574  }
4575  
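/*
 * Terminate the postcopy preempt channel by sending a final EOS
 * marker and flushing the file.
 */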
4576  void postcopy_preempt_shutdown_file(MigrationState *s)
4577  {
4578      qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4579      qemu_fflush(s->postcopy_qemufile_src);
4580  }
4581  
4582  static SaveVMHandlers savevm_ram_handlers = {
4583      .save_setup = ram_save_setup,
4584      .save_live_iterate = ram_save_iterate,
4585      .save_live_complete_postcopy = ram_save_complete,
4586      .save_live_complete_precopy = ram_save_complete,
4587      .has_postcopy = ram_has_postcopy,
4588      .state_pending_exact = ram_state_pending_exact,
4589      .state_pending_estimate = ram_state_pending_estimate,
4590      .load_state = ram_load,
4591      .save_cleanup = ram_save_cleanup,
4592      .load_setup = ram_load_setup,
4593      .load_cleanup = ram_load_cleanup,
4594      .resume_prepare = ram_resume_prepare,
4595  };
4596  
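/*
 * Notifier called when a RAM block is resized: cancel an in-progress
 * precopy migration (block sizes must not change mid-stream) and keep
 * postcopy's view of the block length up to date on the incoming side.
 */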
4597  static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4598                                        size_t old_size, size_t new_size)
4599  {
4600      PostcopyState ps = postcopy_state_get();
4601      ram_addr_t offset;
4602      RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4603      Error *err = NULL;
4604  
4605      if (!rb) {
4606          error_report("RAM block not found");
4607          return;
4608      }
4609  
4610      if (migrate_ram_is_ignored(rb)) {
4611          return;
4612      }
4613  
4614      if (!migration_is_idle()) {
4615          /*
4616           * Precopy code on the source cannot deal with the size of RAM blocks
4617           * changing at random points in time - especially after sending the
4618           * RAM block sizes in the migration stream, they must no longer change.
4619           * Abort and indicate a proper reason.
4620           */
4621          error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4622          migration_cancel(err);
4623          error_free(err);
4624      }
4625  
4626      switch (ps) {
4627      case POSTCOPY_INCOMING_ADVISE:
4628          /*
4629           * Update what ram_postcopy_incoming_init()->init_range() does at the
4630           * time postcopy was advised. Syncing RAM blocks with the source will
4631           * result in RAM resizes.
4632           */
4633          if (old_size < new_size) {
4634              if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4635                  error_report("RAM block '%s' discard of resized RAM failed",
4636                               rb->idstr);
4637              }
4638          }
4639          rb->postcopy_length = new_size;
4640          break;
4641      case POSTCOPY_INCOMING_NONE:
4642      case POSTCOPY_INCOMING_RUNNING:
4643      case POSTCOPY_INCOMING_END:
4644          /*
4645           * Once our guest is running, postcopy no longer cares about
4646           * resizes. When growing, the new memory was not available on the
4647           * source, no handler needed.
4648           */
4649          break;
4650      default:
4651          error_report("RAM block '%s' resized during postcopy state: %d",
4652                       rb->idstr, ps);
4653          exit(-1);
4654      }
4655  }
4656  
4657  static RAMBlockNotifier ram_mig_ram_notifier = {
4658      .ram_block_resized = ram_mig_ram_block_resized,
4659  };
4660  
4661  void ram_mig_init(void)
4662  {
4663      qemu_mutex_init(&XBZRLE.lock);
4664      register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4665      ram_block_notifier_add(&ram_mig_ram_notifier);
4666  }
4667