xref: /openbmc/qemu/migration/ram.c (revision 623d7e3551a6fc5693c06ea938c60fe281b52e27)
1  /*
2   * QEMU System Emulator
3   *
4   * Copyright (c) 2003-2008 Fabrice Bellard
5   * Copyright (c) 2011-2015 Red Hat Inc
6   *
7   * Authors:
8   *  Juan Quintela <quintela@redhat.com>
9   *
10   * Permission is hereby granted, free of charge, to any person obtaining a copy
11   * of this software and associated documentation files (the "Software"), to deal
12   * in the Software without restriction, including without limitation the rights
13   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14   * copies of the Software, and to permit persons to whom the Software is
15   * furnished to do so, subject to the following conditions:
16   *
17   * The above copyright notice and this permission notice shall be included in
18   * all copies or substantial portions of the Software.
19   *
20   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26   * THE SOFTWARE.
27   */
28  
29  #include "qemu/osdep.h"
30  #include "qemu/cutils.h"
31  #include "qemu/bitops.h"
32  #include "qemu/bitmap.h"
33  #include "qemu/madvise.h"
34  #include "qemu/main-loop.h"
35  #include "xbzrle.h"
36  #include "ram-compress.h"
37  #include "ram.h"
38  #include "migration.h"
39  #include "migration-stats.h"
40  #include "migration/register.h"
41  #include "migration/misc.h"
42  #include "qemu-file.h"
43  #include "postcopy-ram.h"
44  #include "page_cache.h"
45  #include "qemu/error-report.h"
46  #include "qapi/error.h"
47  #include "qapi/qapi-types-migration.h"
48  #include "qapi/qapi-events-migration.h"
49  #include "qapi/qmp/qerror.h"
50  #include "trace.h"
51  #include "exec/ram_addr.h"
52  #include "exec/target_page.h"
53  #include "qemu/rcu_queue.h"
54  #include "migration/colo.h"
55  #include "block.h"
56  #include "sysemu/cpu-throttle.h"
57  #include "savevm.h"
58  #include "qemu/iov.h"
59  #include "multifd.h"
60  #include "sysemu/runstate.h"
61  #include "options.h"
62  
63  #include "hw/boards.h" /* for machine_dump_guest_core() */
64  
65  #if defined(__linux__)
66  #include "qemu/userfaultfd.h"
67  #endif /* defined(__linux__) */
68  
69  /***********************************************************/
70  /* ram save/restore */
71  
72  /*
73   * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
74   * worked for pages that were filled with the same char.  We switched
75   * it to only search for the zero value.  And to avoid confusion with
76   * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
77   */
78  /*
79   * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
80   */
81  #define RAM_SAVE_FLAG_FULL     0x01
82  #define RAM_SAVE_FLAG_ZERO     0x02
83  #define RAM_SAVE_FLAG_MEM_SIZE 0x04
84  #define RAM_SAVE_FLAG_PAGE     0x08
85  #define RAM_SAVE_FLAG_EOS      0x10
86  #define RAM_SAVE_FLAG_CONTINUE 0x20
87  #define RAM_SAVE_FLAG_XBZRLE   0x40
88  /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
89  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
90  #define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
91  /* We can't use any flag that is bigger than 0x200 */
92  
93  XBZRLECacheStats xbzrle_counters;
94  
95  /* used by the search for pages to send */
96  struct PageSearchStatus {
97      /* The migration channel used for a specific host page */
98      QEMUFile    *pss_channel;
99      /* Last block from where we have sent data */
100      RAMBlock *last_sent_block;
101      /* Current block being searched */
102      RAMBlock    *block;
103      /* Current page to search from */
104      unsigned long page;
105      /* Set once we wrap around */
106      bool         complete_round;
107      /* Whether we're sending a host page */
108      bool          host_page_sending;
109      /* The start/end of current host page.  Invalid if host_page_sending==false */
110      unsigned long host_page_start;
111      unsigned long host_page_end;
112  };
113  typedef struct PageSearchStatus PageSearchStatus;
114  
115  /* struct contains XBZRLE cache and a static page
116     used by the compression */
117  static struct {
118      /* buffer used for XBZRLE encoding */
119      uint8_t *encoded_buf;
120      /* buffer for storing page content */
121      uint8_t *current_buf;
122      /* Cache for XBZRLE, Protected by lock. */
123      PageCache *cache;
124      QemuMutex lock;
125      /* it will store a page full of zeros */
126      uint8_t *zero_target_page;
127      /* buffer used for XBZRLE decoding */
128      uint8_t *decoded_buf;
129  } XBZRLE;
130  
131  static void XBZRLE_cache_lock(void)
132  {
133      if (migrate_xbzrle()) {
134          qemu_mutex_lock(&XBZRLE.lock);
135      }
136  }
137  
138  static void XBZRLE_cache_unlock(void)
139  {
140      if (migrate_xbzrle()) {
141          qemu_mutex_unlock(&XBZRLE.lock);
142      }
143  }
144  
145  /**
146   * xbzrle_cache_resize: resize the xbzrle cache
147   *
148   * This function is called from migrate_params_apply in main
149   * thread, possibly while a migration is in progress.  A running
150   * migration may be using the cache and might finish during this call,
151   * hence changes to the cache are protected by XBZRLE.lock().
152   *
153   * Returns 0 for success or -1 for error
154   *
155   * @new_size: new cache size
156   * @errp: set *errp if the check failed, with reason
157   */
158  int xbzrle_cache_resize(uint64_t new_size, Error **errp)
159  {
160      PageCache *new_cache;
161      int64_t ret = 0;
162  
163      /* Check for truncation */
164      if (new_size != (size_t)new_size) {
165          error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
166                     "exceeding address space");
167          return -1;
168      }
169  
170      if (new_size == migrate_xbzrle_cache_size()) {
171          /* nothing to do */
172          return 0;
173      }
174  
175      XBZRLE_cache_lock();
176  
177      if (XBZRLE.cache != NULL) {
178          new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
179          if (!new_cache) {
180              ret = -1;
181              goto out;
182          }
183  
184          cache_fini(XBZRLE.cache);
185          XBZRLE.cache = new_cache;
186      }
187  out:
188      XBZRLE_cache_unlock();
189      return ret;
190  }
191  
192  static bool postcopy_preempt_active(void)
193  {
194      return migrate_postcopy_preempt() && migration_in_postcopy();
195  }
196  
197  bool ramblock_is_ignored(RAMBlock *block)
198  {
199      return !qemu_ram_is_migratable(block) ||
200             (migrate_ignore_shared() && qemu_ram_is_shared(block)
201                                      && qemu_ram_is_named_file(block));
202  }
203  
204  #undef RAMBLOCK_FOREACH
205  
206  int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
207  {
208      RAMBlock *block;
209      int ret = 0;
210  
211      RCU_READ_LOCK_GUARD();
212  
213      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
214          ret = func(block, opaque);
215          if (ret) {
216              break;
217          }
218      }
219      return ret;
220  }
221  
222  static void ramblock_recv_map_init(void)
223  {
224      RAMBlock *rb;
225  
226      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
227          assert(!rb->receivedmap);
228          rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
229      }
230  }
231  
232  int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
233  {
234      return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
235                      rb->receivedmap);
236  }
237  
238  bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
239  {
240      return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
241  }
242  
243  void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
244  {
245      set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
246  }
247  
248  void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
249                                      size_t nr)
250  {
251      bitmap_set_atomic(rb->receivedmap,
252                        ramblock_recv_bitmap_offset(host_addr, rb),
253                        nr);
254  }
255  
256  #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
257  
258  /*
259   * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
260   *
261   * Returns >0 if success with sent bytes, or <0 if error.
262   */
263  int64_t ramblock_recv_bitmap_send(QEMUFile *file,
264                                    const char *block_name)
265  {
266      RAMBlock *block = qemu_ram_block_by_name(block_name);
267      unsigned long *le_bitmap, nbits;
268      uint64_t size;
269  
270      if (!block) {
271          error_report("%s: invalid block name: %s", __func__, block_name);
272          return -1;
273      }
274  
275      nbits = block->postcopy_length >> TARGET_PAGE_BITS;
276  
277      /*
278       * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
279       * machines we may need 4 more bytes for padding (see below
280       * comment). So extend it a bit before hand.
281       */
282      le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
283  
284      /*
285       * Always use little endian when sending the bitmap. This is
286       * required that when source and destination VMs are not using the
287       * same endianness. (Note: big endian won't work.)
288       */
289      bitmap_to_le(le_bitmap, block->receivedmap, nbits);
290  
291      /* Size of the bitmap, in bytes */
292      size = DIV_ROUND_UP(nbits, 8);
293  
294      /*
295       * size is always aligned to 8 bytes for 64bit machines, but it
296       * may not be true for 32bit machines. We need this padding to
297       * make sure the migration can survive even between 32bit and
298       * 64bit machines.
299       */
300      size = ROUND_UP(size, 8);
301  
302      qemu_put_be64(file, size);
303      qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
304      /*
305       * Mark as an end, in case the middle part is screwed up due to
306       * some "mysterious" reason.
307       */
308      qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
309      qemu_fflush(file);
310  
311      g_free(le_bitmap);
312  
313      if (qemu_file_get_error(file)) {
314          return qemu_file_get_error(file);
315      }
316  
317      return size + sizeof(size);
318  }
319  
320  /*
321   * An outstanding page request, on the source, having been received
322   * and queued
323   */
324  struct RAMSrcPageRequest {
325      RAMBlock *rb;
326      hwaddr    offset;
327      hwaddr    len;
328  
329      QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
330  };
331  
332  /* State of RAM for migration */
333  struct RAMState {
334      /*
335       * PageSearchStatus structures for the channels when send pages.
336       * Protected by the bitmap_mutex.
337       */
338      PageSearchStatus pss[RAM_CHANNEL_MAX];
339      /* UFFD file descriptor, used in 'write-tracking' migration */
340      int uffdio_fd;
341      /* total ram size in bytes */
342      uint64_t ram_bytes_total;
343      /* Last block that we have visited searching for dirty pages */
344      RAMBlock *last_seen_block;
345      /* Last dirty target page we have sent */
346      ram_addr_t last_page;
347      /* last ram version we have seen */
348      uint32_t last_version;
349      /* How many times we have dirty too many pages */
350      int dirty_rate_high_cnt;
351      /* these variables are used for bitmap sync */
352      /* last time we did a full bitmap_sync */
353      int64_t time_last_bitmap_sync;
354      /* bytes transferred at start_time */
355      uint64_t bytes_xfer_prev;
356      /* number of dirty pages since start_time */
357      uint64_t num_dirty_pages_period;
358      /* xbzrle misses since the beginning of the period */
359      uint64_t xbzrle_cache_miss_prev;
360      /* Amount of xbzrle pages since the beginning of the period */
361      uint64_t xbzrle_pages_prev;
362      /* Amount of xbzrle encoded bytes since the beginning of the period */
363      uint64_t xbzrle_bytes_prev;
364      /* Are we really using XBZRLE (e.g., after the first round). */
365      bool xbzrle_started;
366      /* Are we on the last stage of migration */
367      bool last_stage;
368      /* compression statistics since the beginning of the period */
369      /* amount of count that no free thread to compress data */
370      uint64_t compress_thread_busy_prev;
371      /* amount bytes after compression */
372      uint64_t compressed_size_prev;
373      /* amount of compressed pages */
374      uint64_t compress_pages_prev;
375  
376      /* total handled target pages at the beginning of period */
377      uint64_t target_page_count_prev;
378      /* total handled target pages since start */
379      uint64_t target_page_count;
380      /* number of dirty bits in the bitmap */
381      uint64_t migration_dirty_pages;
382      /*
383       * Protects:
384       * - dirty/clear bitmap
385       * - migration_dirty_pages
386       * - pss structures
387       */
388      QemuMutex bitmap_mutex;
389      /* The RAMBlock used in the last src_page_requests */
390      RAMBlock *last_req_rb;
391      /* Queue of outstanding page requests from the destination */
392      QemuMutex src_page_req_mutex;
393      QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
394  };
395  typedef struct RAMState RAMState;
396  
397  static RAMState *ram_state;
398  
399  static NotifierWithReturnList precopy_notifier_list;
400  
401  /* Whether postcopy has queued requests? */
402  static bool postcopy_has_request(RAMState *rs)
403  {
404      return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
405  }
406  
407  void precopy_infrastructure_init(void)
408  {
409      notifier_with_return_list_init(&precopy_notifier_list);
410  }
411  
412  void precopy_add_notifier(NotifierWithReturn *n)
413  {
414      notifier_with_return_list_add(&precopy_notifier_list, n);
415  }
416  
417  void precopy_remove_notifier(NotifierWithReturn *n)
418  {
419      notifier_with_return_remove(n);
420  }
421  
422  int precopy_notify(PrecopyNotifyReason reason, Error **errp)
423  {
424      PrecopyNotifyData pnd;
425      pnd.reason = reason;
426      pnd.errp = errp;
427  
428      return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
429  }
430  
431  uint64_t ram_bytes_remaining(void)
432  {
433      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
434                         0;
435  }
436  
437  void ram_transferred_add(uint64_t bytes)
438  {
439      if (runstate_is_running()) {
440          stat64_add(&mig_stats.precopy_bytes, bytes);
441      } else if (migration_in_postcopy()) {
442          stat64_add(&mig_stats.postcopy_bytes, bytes);
443      } else {
444          stat64_add(&mig_stats.downtime_bytes, bytes);
445      }
446      stat64_add(&mig_stats.transferred, bytes);
447  }
448  
449  struct MigrationOps {
450      int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
451  };
452  typedef struct MigrationOps MigrationOps;
453  
454  MigrationOps *migration_ops;
455  
456  static int ram_save_host_page_urgent(PageSearchStatus *pss);
457  
458  /* NOTE: page is the PFN not real ram_addr_t. */
459  static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
460  {
461      pss->block = rb;
462      pss->page = page;
463      pss->complete_round = false;
464  }
465  
466  /*
467   * Check whether two PSSs are actively sending the same page.  Return true
468   * if it is, false otherwise.
469   */
470  static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
471  {
472      return pss1->host_page_sending && pss2->host_page_sending &&
473          (pss1->host_page_start == pss2->host_page_start);
474  }
475  
476  /**
477   * save_page_header: write page header to wire
478   *
479   * If this is the 1st block, it also writes the block identification
480   *
481   * Returns the number of bytes written
482   *
483   * @pss: current PSS channel status
484   * @block: block that contains the page we want to send
485   * @offset: offset inside the block for the page
486   *          in the lower bits, it contains flags
487   */
488  static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
489                                 RAMBlock *block, ram_addr_t offset)
490  {
491      size_t size, len;
492      bool same_block = (block == pss->last_sent_block);
493  
494      if (same_block) {
495          offset |= RAM_SAVE_FLAG_CONTINUE;
496      }
497      qemu_put_be64(f, offset);
498      size = 8;
499  
500      if (!same_block) {
501          len = strlen(block->idstr);
502          qemu_put_byte(f, len);
503          qemu_put_buffer(f, (uint8_t *)block->idstr, len);
504          size += 1 + len;
505          pss->last_sent_block = block;
506      }
507      return size;
508  }
509  
510  /**
511   * mig_throttle_guest_down: throttle down the guest
512   *
513   * Reduce amount of guest cpu execution to hopefully slow down memory
514   * writes. If guest dirty memory rate is reduced below the rate at
515   * which we can transfer pages to the destination then we should be
516   * able to complete migration. Some workloads dirty memory way too
517   * fast and will not effectively converge, even with auto-converge.
518   */
519  static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
520                                      uint64_t bytes_dirty_threshold)
521  {
522      uint64_t pct_initial = migrate_cpu_throttle_initial();
523      uint64_t pct_increment = migrate_cpu_throttle_increment();
524      bool pct_tailslow = migrate_cpu_throttle_tailslow();
525      int pct_max = migrate_max_cpu_throttle();
526  
527      uint64_t throttle_now = cpu_throttle_get_percentage();
528      uint64_t cpu_now, cpu_ideal, throttle_inc;
529  
530      /* We have not started throttling yet. Let's start it. */
531      if (!cpu_throttle_active()) {
532          cpu_throttle_set(pct_initial);
533      } else {
534          /* Throttling already on, just increase the rate */
535          if (!pct_tailslow) {
536              throttle_inc = pct_increment;
537          } else {
538              /* Compute the ideal CPU percentage used by Guest, which may
539               * make the dirty rate match the dirty rate threshold. */
540              cpu_now = 100 - throttle_now;
541              cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
542                          bytes_dirty_period);
543              throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
544          }
545          cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
546      }
547  }
548  
549  void mig_throttle_counter_reset(void)
550  {
551      RAMState *rs = ram_state;
552  
553      rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
554      rs->num_dirty_pages_period = 0;
555      rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
556  }
557  
558  /**
559   * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
560   *
561   * @rs: current RAM state
562   * @current_addr: address for the zero page
563   *
564   * Update the xbzrle cache to reflect a page that's been sent as all 0.
565   * The important thing is that a stale (not-yet-0'd) page be replaced
566   * by the new data.
567   * As a bonus, if the page wasn't in the cache it gets added so that
568   * when a small write is made into the 0'd page it gets XBZRLE sent.
569   */
570  static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
571  {
572      /* We don't care if this fails to allocate a new cache page
573       * as long as it updated an old one */
574      cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
575                   stat64_get(&mig_stats.dirty_sync_count));
576  }
577  
578  #define ENCODING_FLAG_XBZRLE 0x1
579  
580  /**
581   * save_xbzrle_page: compress and send current page
582   *
583   * Returns: 1 means that we wrote the page
584   *          0 means that page is identical to the one already sent
585   *          -1 means that xbzrle would be longer than normal
586   *
587   * @rs: current RAM state
588   * @pss: current PSS channel
589   * @current_data: pointer to the address of the page contents
590   * @current_addr: addr of the page
591   * @block: block that contains the page we want to send
592   * @offset: offset inside the block for the page
593   */
594  static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
595                              uint8_t **current_data, ram_addr_t current_addr,
596                              RAMBlock *block, ram_addr_t offset)
597  {
598      int encoded_len = 0, bytes_xbzrle;
599      uint8_t *prev_cached_page;
600      QEMUFile *file = pss->pss_channel;
601      uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
602  
603      if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
604          xbzrle_counters.cache_miss++;
605          if (!rs->last_stage) {
606              if (cache_insert(XBZRLE.cache, current_addr, *current_data,
607                               generation) == -1) {
608                  return -1;
609              } else {
610                  /* update *current_data when the page has been
611                     inserted into cache */
612                  *current_data = get_cached_data(XBZRLE.cache, current_addr);
613              }
614          }
615          return -1;
616      }
617  
618      /*
619       * Reaching here means the page has hit the xbzrle cache, no matter what
620       * encoding result it is (normal encoding, overflow or skipping the page),
621       * count the page as encoded. This is used to calculate the encoding rate.
622       *
623       * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
624       * 2nd page turns out to be skipped (i.e. no new bytes written to the
625       * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
626       * skipped page included. In this way, the encoding rate can tell if the
627       * guest page is good for xbzrle encoding.
628       */
629      xbzrle_counters.pages++;
630      prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
631  
632      /* save current buffer into memory */
633      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
634  
635      /* XBZRLE encoding (if there is no overflow) */
636      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
637                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
638                                         TARGET_PAGE_SIZE);
639  
640      /*
641       * Update the cache contents, so that it corresponds to the data
642       * sent, in all cases except where we skip the page.
643       */
644      if (!rs->last_stage && encoded_len != 0) {
645          memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
646          /*
647           * In the case where we couldn't compress, ensure that the caller
648           * sends the data from the cache, since the guest might have
649           * changed the RAM since we copied it.
650           */
651          *current_data = prev_cached_page;
652      }
653  
654      if (encoded_len == 0) {
655          trace_save_xbzrle_page_skipping();
656          return 0;
657      } else if (encoded_len == -1) {
658          trace_save_xbzrle_page_overflow();
659          xbzrle_counters.overflow++;
660          xbzrle_counters.bytes += TARGET_PAGE_SIZE;
661          return -1;
662      }
663  
664      /* Send XBZRLE based compressed page */
665      bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
666                                      offset | RAM_SAVE_FLAG_XBZRLE);
667      qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
668      qemu_put_be16(file, encoded_len);
669      qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
670      bytes_xbzrle += encoded_len + 1 + 2;
671      /*
672       * Like compressed_size (please see update_compress_thread_counts),
673       * the xbzrle encoded bytes don't count the 8 byte header with
674       * RAM_SAVE_FLAG_CONTINUE.
675       */
676      xbzrle_counters.bytes += bytes_xbzrle - 8;
677      ram_transferred_add(bytes_xbzrle);
678  
679      return 1;
680  }
681  
682  /**
683   * pss_find_next_dirty: find the next dirty page of current ramblock
684   *
685   * This function updates pss->page to point to the next dirty page index
686   * within the ramblock to migrate, or the end of ramblock when nothing
687   * found.  Note that when pss->host_page_sending==true it means we're
688   * during sending a host page, so we won't look for dirty page that is
689   * outside the host page boundary.
690   *
691   * @pss: the current page search status
692   */
693  static void pss_find_next_dirty(PageSearchStatus *pss)
694  {
695      RAMBlock *rb = pss->block;
696      unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
697      unsigned long *bitmap = rb->bmap;
698  
699      if (ramblock_is_ignored(rb)) {
700          /* Points directly to the end, so we know no dirty page */
701          pss->page = size;
702          return;
703      }
704  
705      /*
706       * If during sending a host page, only look for dirty pages within the
707       * current host page being send.
708       */
709      if (pss->host_page_sending) {
710          assert(pss->host_page_end);
711          size = MIN(size, pss->host_page_end);
712      }
713  
714      pss->page = find_next_bit(bitmap, size, pss->page);
715  }
716  
717  static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
718                                                         unsigned long page)
719  {
720      uint8_t shift;
721      hwaddr size, start;
722  
723      if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
724          return;
725      }
726  
727      shift = rb->clear_bmap_shift;
728      /*
729       * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
730       * can make things easier sometimes since then start address
731       * of the small chunk will always be 64 pages aligned so the
732       * bitmap will always be aligned to unsigned long. We should
733       * even be able to remove this restriction but I'm simply
734       * keeping it.
735       */
736      assert(shift >= 6);
737  
738      size = 1ULL << (TARGET_PAGE_BITS + shift);
739      start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
740      trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
741      memory_region_clear_dirty_bitmap(rb->mr, start, size);
742  }
743  
744  static void
745  migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
746                                                   unsigned long start,
747                                                   unsigned long npages)
748  {
749      unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
750      unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
751      unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
752  
753      /*
754       * Clear pages from start to start + npages - 1, so the end boundary is
755       * exclusive.
756       */
757      for (i = chunk_start; i < chunk_end; i += chunk_pages) {
758          migration_clear_memory_region_dirty_bitmap(rb, i);
759      }
760  }
761  
762  /*
763   * colo_bitmap_find_diry:find contiguous dirty pages from start
764   *
765   * Returns the page offset within memory region of the start of the contiguout
766   * dirty page
767   *
768   * @rs: current RAM state
769   * @rb: RAMBlock where to search for dirty pages
770   * @start: page where we start the search
771   * @num: the number of contiguous dirty pages
772   */
773  static inline
774  unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
775                                       unsigned long start, unsigned long *num)
776  {
777      unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
778      unsigned long *bitmap = rb->bmap;
779      unsigned long first, next;
780  
781      *num = 0;
782  
783      if (ramblock_is_ignored(rb)) {
784          return size;
785      }
786  
787      first = find_next_bit(bitmap, size, start);
788      if (first >= size) {
789          return first;
790      }
791      next = find_next_zero_bit(bitmap, size, first + 1);
792      assert(next >= first);
793      *num = next - first;
794      return first;
795  }
796  
797  static inline bool migration_bitmap_clear_dirty(RAMState *rs,
798                                                  RAMBlock *rb,
799                                                  unsigned long page)
800  {
801      bool ret;
802  
803      /*
804       * Clear dirty bitmap if needed.  This _must_ be called before we
805       * send any of the page in the chunk because we need to make sure
806       * we can capture further page content changes when we sync dirty
807       * log the next time.  So as long as we are going to send any of
808       * the page in the chunk we clear the remote dirty bitmap for all.
809       * Clearing it earlier won't be a problem, but too late will.
810       */
811      migration_clear_memory_region_dirty_bitmap(rb, page);
812  
813      ret = test_and_clear_bit(page, rb->bmap);
814      if (ret) {
815          rs->migration_dirty_pages--;
816      }
817  
818      return ret;
819  }
820  
821  static void dirty_bitmap_clear_section(MemoryRegionSection *section,
822                                         void *opaque)
823  {
824      const hwaddr offset = section->offset_within_region;
825      const hwaddr size = int128_get64(section->size);
826      const unsigned long start = offset >> TARGET_PAGE_BITS;
827      const unsigned long npages = size >> TARGET_PAGE_BITS;
828      RAMBlock *rb = section->mr->ram_block;
829      uint64_t *cleared_bits = opaque;
830  
831      /*
832       * We don't grab ram_state->bitmap_mutex because we expect to run
833       * only when starting migration or during postcopy recovery where
834       * we don't have concurrent access.
835       */
836      if (!migration_in_postcopy() && !migrate_background_snapshot()) {
837          migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
838      }
839      *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
840      bitmap_clear(rb->bmap, start, npages);
841  }
842  
843  /*
844   * Exclude all dirty pages from migration that fall into a discarded range as
845   * managed by a RamDiscardManager responsible for the mapped memory region of
846   * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
847   *
848   * Discarded pages ("logically unplugged") have undefined content and must
849   * not get migrated, because even reading these pages for migration might
850   * result in undesired behavior.
851   *
852   * Returns the number of cleared bits in the RAMBlock dirty bitmap.
853   *
854   * Note: The result is only stable while migrating (precopy/postcopy).
855   */
856  static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
857  {
858      uint64_t cleared_bits = 0;
859  
860      if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
861          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
862          MemoryRegionSection section = {
863              .mr = rb->mr,
864              .offset_within_region = 0,
865              .size = int128_make64(qemu_ram_get_used_length(rb)),
866          };
867  
868          ram_discard_manager_replay_discarded(rdm, &section,
869                                               dirty_bitmap_clear_section,
870                                               &cleared_bits);
871      }
872      return cleared_bits;
873  }
874  
875  /*
876   * Check if a host-page aligned page falls into a discarded range as managed by
877   * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
878   *
879   * Note: The result is only stable while migrating (precopy/postcopy).
880   */
881  bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
882  {
883      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
884          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
885          MemoryRegionSection section = {
886              .mr = rb->mr,
887              .offset_within_region = start,
888              .size = int128_make64(qemu_ram_pagesize(rb)),
889          };
890  
891          return !ram_discard_manager_is_populated(rdm, &section);
892      }
893      return false;
894  }
895  
896  /* Called with RCU critical section */
897  static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
898  {
899      uint64_t new_dirty_pages =
900          cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
901  
902      rs->migration_dirty_pages += new_dirty_pages;
903      rs->num_dirty_pages_period += new_dirty_pages;
904  }
905  
906  /**
907   * ram_pagesize_summary: calculate all the pagesizes of a VM
908   *
909   * Returns a summary bitmap of the page sizes of all RAMBlocks
910   *
911   * For VMs with just normal pages this is equivalent to the host page
912   * size. If it's got some huge pages then it's the OR of all the
913   * different page sizes.
914   */
915  uint64_t ram_pagesize_summary(void)
916  {
917      RAMBlock *block;
918      uint64_t summary = 0;
919  
920      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
921          summary |= block->page_size;
922      }
923  
924      return summary;
925  }
926  
927  uint64_t ram_get_total_transferred_pages(void)
928  {
929      return stat64_get(&mig_stats.normal_pages) +
930          stat64_get(&mig_stats.zero_pages) +
931          compression_counters.pages + xbzrle_counters.pages;
932  }
933  
934  static void migration_update_rates(RAMState *rs, int64_t end_time)
935  {
936      uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
937      double compressed_size;
938  
939      /* calculate period counters */
940      stat64_set(&mig_stats.dirty_pages_rate,
941                 rs->num_dirty_pages_period * 1000 /
942                 (end_time - rs->time_last_bitmap_sync));
943  
944      if (!page_count) {
945          return;
946      }
947  
948      if (migrate_xbzrle()) {
949          double encoded_size, unencoded_size;
950  
951          xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
952              rs->xbzrle_cache_miss_prev) / page_count;
953          rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
954          unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
955                           TARGET_PAGE_SIZE;
956          encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
957          if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
958              xbzrle_counters.encoding_rate = 0;
959          } else {
960              xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
961          }
962          rs->xbzrle_pages_prev = xbzrle_counters.pages;
963          rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
964      }
965  
966      if (migrate_compress()) {
967          compression_counters.busy_rate = (double)(compression_counters.busy -
968              rs->compress_thread_busy_prev) / page_count;
969          rs->compress_thread_busy_prev = compression_counters.busy;
970  
971          compressed_size = compression_counters.compressed_size -
972                            rs->compressed_size_prev;
973          if (compressed_size) {
974              double uncompressed_size = (compression_counters.pages -
975                                      rs->compress_pages_prev) * TARGET_PAGE_SIZE;
976  
977              /* Compression-Ratio = Uncompressed-size / Compressed-size */
978              compression_counters.compression_rate =
979                                          uncompressed_size / compressed_size;
980  
981              rs->compress_pages_prev = compression_counters.pages;
982              rs->compressed_size_prev = compression_counters.compressed_size;
983          }
984      }
985  }
986  
987  static void migration_trigger_throttle(RAMState *rs)
988  {
989      uint64_t threshold = migrate_throttle_trigger_threshold();
990      uint64_t bytes_xfer_period =
991          stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
992      uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
993      uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
994  
995      /* During block migration the auto-converge logic incorrectly detects
996       * that ram migration makes no progress. Avoid this by disabling the
997       * throttling logic during the bulk phase of block migration. */
998      if (migrate_auto_converge() && !blk_mig_bulk_active()) {
999          /* The following detection logic can be refined later. For now:
1000             Check to see if the ratio between dirtied bytes and the approx.
1001             amount of bytes that just got transferred since the last time
1002             we were in this routine reaches the threshold. If that happens
1003             twice, start or increase throttling. */
1004  
1005          if ((bytes_dirty_period > bytes_dirty_threshold) &&
1006              (++rs->dirty_rate_high_cnt >= 2)) {
1007              trace_migration_throttle();
1008              rs->dirty_rate_high_cnt = 0;
1009              mig_throttle_guest_down(bytes_dirty_period,
1010                                      bytes_dirty_threshold);
1011          }
1012      }
1013  }
1014  
1015  static void migration_bitmap_sync(RAMState *rs, bool last_stage)
1016  {
1017      RAMBlock *block;
1018      int64_t end_time;
1019  
1020      stat64_add(&mig_stats.dirty_sync_count, 1);
1021  
1022      if (!rs->time_last_bitmap_sync) {
1023          rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1024      }
1025  
1026      trace_migration_bitmap_sync_start();
1027      memory_global_dirty_log_sync(last_stage);
1028  
1029      qemu_mutex_lock(&rs->bitmap_mutex);
1030      WITH_RCU_READ_LOCK_GUARD() {
1031          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1032              ramblock_sync_dirty_bitmap(rs, block);
1033          }
1034          stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1035      }
1036      qemu_mutex_unlock(&rs->bitmap_mutex);
1037  
1038      memory_global_after_dirty_log_sync();
1039      trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1040  
1041      end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1042  
1043      /* more than 1 second = 1000 millisecons */
1044      if (end_time > rs->time_last_bitmap_sync + 1000) {
1045          migration_trigger_throttle(rs);
1046  
1047          migration_update_rates(rs, end_time);
1048  
1049          rs->target_page_count_prev = rs->target_page_count;
1050  
1051          /* reset period counters */
1052          rs->time_last_bitmap_sync = end_time;
1053          rs->num_dirty_pages_period = 0;
1054          rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1055      }
1056      if (migrate_events()) {
1057          uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1058          qapi_event_send_migration_pass(generation);
1059      }
1060  }
1061  
1062  static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1063  {
1064      Error *local_err = NULL;
1065  
1066      /*
1067       * The current notifier usage is just an optimization to migration, so we
1068       * don't stop the normal migration process in the error case.
1069       */
1070      if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1071          error_report_err(local_err);
1072          local_err = NULL;
1073      }
1074  
1075      migration_bitmap_sync(rs, last_stage);
1076  
1077      if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1078          error_report_err(local_err);
1079      }
1080  }
1081  
1082  void ram_release_page(const char *rbname, uint64_t offset)
1083  {
1084      if (!migrate_release_ram() || !migration_in_postcopy()) {
1085          return;
1086      }
1087  
1088      ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1089  }
1090  
1091  /**
1092   * save_zero_page_to_file: send the zero page to the file
1093   *
1094   * Returns the size of data written to the file, 0 means the page is not
1095   * a zero page
1096   *
1097   * @pss: current PSS channel
1098   * @block: block that contains the page we want to send
1099   * @offset: offset inside the block for the page
1100   */
1101  static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1102                                    RAMBlock *block, ram_addr_t offset)
1103  {
1104      uint8_t *p = block->host + offset;
1105      int len = 0;
1106  
1107      if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1108          len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1109          qemu_put_byte(file, 0);
1110          len += 1;
1111          ram_release_page(block->idstr, offset);
1112      }
1113      return len;
1114  }
1115  
1116  /**
1117   * save_zero_page: send the zero page to the stream
1118   *
1119   * Returns the number of pages written.
1120   *
1121   * @pss: current PSS channel
1122   * @block: block that contains the page we want to send
1123   * @offset: offset inside the block for the page
1124   */
1125  static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1126                            ram_addr_t offset)
1127  {
1128      int len = save_zero_page_to_file(pss, f, block, offset);
1129  
1130      if (len) {
1131          stat64_add(&mig_stats.zero_pages, 1);
1132          ram_transferred_add(len);
1133          return 1;
1134      }
1135      return -1;
1136  }
1137  
1138  /*
1139   * @pages: the number of pages written by the control path,
1140   *        < 0 - error
1141   *        > 0 - number of pages written
1142   *
1143   * Return true if the pages has been saved, otherwise false is returned.
1144   */
1145  static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1146                                ram_addr_t offset, int *pages)
1147  {
1148      uint64_t bytes_xmit = 0;
1149      int ret;
1150  
1151      *pages = -1;
1152      ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1153                                  TARGET_PAGE_SIZE, &bytes_xmit);
1154      if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1155          return false;
1156      }
1157  
1158      if (bytes_xmit) {
1159          ram_transferred_add(bytes_xmit);
1160          *pages = 1;
1161      }
1162  
1163      if (ret == RAM_SAVE_CONTROL_DELAYED) {
1164          return true;
1165      }
1166  
1167      if (bytes_xmit > 0) {
1168          stat64_add(&mig_stats.normal_pages, 1);
1169      } else if (bytes_xmit == 0) {
1170          stat64_add(&mig_stats.zero_pages, 1);
1171      }
1172  
1173      return true;
1174  }
1175  
1176  /*
1177   * directly send the page to the stream
1178   *
1179   * Returns the number of pages written.
1180   *
1181   * @pss: current PSS channel
1182   * @block: block that contains the page we want to send
1183   * @offset: offset inside the block for the page
1184   * @buf: the page to be sent
1185   * @async: send to page asyncly
1186   */
1187  static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1188                              ram_addr_t offset, uint8_t *buf, bool async)
1189  {
1190      QEMUFile *file = pss->pss_channel;
1191  
1192      ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1193                                           offset | RAM_SAVE_FLAG_PAGE));
1194      if (async) {
1195          qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1196                                migrate_release_ram() &&
1197                                migration_in_postcopy());
1198      } else {
1199          qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1200      }
1201      ram_transferred_add(TARGET_PAGE_SIZE);
1202      stat64_add(&mig_stats.normal_pages, 1);
1203      return 1;
1204  }
1205  
1206  /**
1207   * ram_save_page: send the given page to the stream
1208   *
1209   * Returns the number of pages written.
1210   *          < 0 - error
1211   *          >=0 - Number of pages written - this might legally be 0
1212   *                if xbzrle noticed the page was the same.
1213   *
1214   * @rs: current RAM state
1215   * @block: block that contains the page we want to send
1216   * @offset: offset inside the block for the page
1217   */
1218  static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1219  {
1220      int pages = -1;
1221      uint8_t *p;
1222      bool send_async = true;
1223      RAMBlock *block = pss->block;
1224      ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1225      ram_addr_t current_addr = block->offset + offset;
1226  
1227      p = block->host + offset;
1228      trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1229  
1230      XBZRLE_cache_lock();
1231      if (rs->xbzrle_started && !migration_in_postcopy()) {
1232          pages = save_xbzrle_page(rs, pss, &p, current_addr,
1233                                   block, offset);
1234          if (!rs->last_stage) {
1235              /* Can't send this cached data async, since the cache page
1236               * might get updated before it gets to the wire
1237               */
1238              send_async = false;
1239          }
1240      }
1241  
1242      /* XBZRLE overflow or normal page */
1243      if (pages == -1) {
1244          pages = save_normal_page(pss, block, offset, p, send_async);
1245      }
1246  
1247      XBZRLE_cache_unlock();
1248  
1249      return pages;
1250  }
1251  
1252  static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1253                                   ram_addr_t offset)
1254  {
1255      if (multifd_queue_page(file, block, offset) < 0) {
1256          return -1;
1257      }
1258      stat64_add(&mig_stats.normal_pages, 1);
1259  
1260      return 1;
1261  }
1262  
1263  static void
1264  update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1265  {
1266      ram_transferred_add(bytes_xmit);
1267  
1268      if (param->result == RES_ZEROPAGE) {
1269          stat64_add(&mig_stats.zero_pages, 1);
1270          return;
1271      }
1272  
1273      /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1274      compression_counters.compressed_size += bytes_xmit - 8;
1275      compression_counters.pages++;
1276  }
1277  
1278  static bool save_page_use_compression(RAMState *rs);
1279  
1280  static int send_queued_data(CompressParam *param)
1281  {
1282      PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1283      MigrationState *ms = migrate_get_current();
1284      QEMUFile *file = ms->to_dst_file;
1285      int len = 0;
1286  
1287      RAMBlock *block = param->block;
1288      ram_addr_t offset = param->offset;
1289  
1290      if (param->result == RES_NONE) {
1291          return 0;
1292      }
1293  
1294      assert(block == pss->last_sent_block);
1295  
1296      if (param->result == RES_ZEROPAGE) {
1297          assert(qemu_file_buffer_empty(param->file));
1298          len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1299          qemu_put_byte(file, 0);
1300          len += 1;
1301          ram_release_page(block->idstr, offset);
1302      } else if (param->result == RES_COMPRESS) {
1303          assert(!qemu_file_buffer_empty(param->file));
1304          len += save_page_header(pss, file, block,
1305                                  offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1306          len += qemu_put_qemu_file(file, param->file);
1307      } else {
1308          abort();
1309      }
1310  
1311      update_compress_thread_counts(param, len);
1312  
1313      return len;
1314  }
1315  
1316  static void ram_flush_compressed_data(RAMState *rs)
1317  {
1318      if (!save_page_use_compression(rs)) {
1319          return;
1320      }
1321  
1322      flush_compressed_data(send_queued_data);
1323  }
1324  
1325  #define PAGE_ALL_CLEAN 0
1326  #define PAGE_TRY_AGAIN 1
1327  #define PAGE_DIRTY_FOUND 2
1328  /**
1329   * find_dirty_block: find the next dirty page and update any state
1330   * associated with the search process.
1331   *
1332   * Returns:
1333   *         <0: An error happened
1334   *         PAGE_ALL_CLEAN: no dirty page found, give up
1335   *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1336   *         PAGE_DIRTY_FOUND: dirty page found
1337   *
1338   * @rs: current RAM state
1339   * @pss: data about the state of the current dirty page scan
1340   * @again: set to false if the search has scanned the whole of RAM
1341   */
1342  static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1343  {
1344      /* Update pss->page for the next dirty bit in ramblock */
1345      pss_find_next_dirty(pss);
1346  
1347      if (pss->complete_round && pss->block == rs->last_seen_block &&
1348          pss->page >= rs->last_page) {
1349          /*
1350           * We've been once around the RAM and haven't found anything.
1351           * Give up.
1352           */
1353          return PAGE_ALL_CLEAN;
1354      }
1355      if (!offset_in_ramblock(pss->block,
1356                              ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1357          /* Didn't find anything in this RAM Block */
1358          pss->page = 0;
1359          pss->block = QLIST_NEXT_RCU(pss->block, next);
1360          if (!pss->block) {
1361              if (!migrate_multifd_flush_after_each_section()) {
1362                  QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1363                  int ret = multifd_send_sync_main(f);
1364                  if (ret < 0) {
1365                      return ret;
1366                  }
1367                  qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1368                  qemu_fflush(f);
1369              }
1370              /*
1371               * If memory migration starts over, we will meet a dirtied page
1372               * which may still exists in compression threads's ring, so we
1373               * should flush the compressed data to make sure the new page
1374               * is not overwritten by the old one in the destination.
1375               *
1376               * Also If xbzrle is on, stop using the data compression at this
1377               * point. In theory, xbzrle can do better than compression.
1378               */
1379              ram_flush_compressed_data(rs);
1380  
1381              /* Hit the end of the list */
1382              pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383              /* Flag that we've looped */
1384              pss->complete_round = true;
1385              /* After the first round, enable XBZRLE. */
1386              if (migrate_xbzrle()) {
1387                  rs->xbzrle_started = true;
1388              }
1389          }
1390          /* Didn't find anything this time, but try again on the new block */
1391          return PAGE_TRY_AGAIN;
1392      } else {
1393          /* We've found something */
1394          return PAGE_DIRTY_FOUND;
1395      }
1396  }
1397  
1398  /**
1399   * unqueue_page: gets a page of the queue
1400   *
1401   * Helper for 'get_queued_page' - gets a page off the queue
1402   *
1403   * Returns the block of the page (or NULL if none available)
1404   *
1405   * @rs: current RAM state
1406   * @offset: used to return the offset within the RAMBlock
1407   */
1408  static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1409  {
1410      struct RAMSrcPageRequest *entry;
1411      RAMBlock *block = NULL;
1412  
1413      if (!postcopy_has_request(rs)) {
1414          return NULL;
1415      }
1416  
1417      QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1418  
1419      /*
1420       * This should _never_ change even after we take the lock, because no one
1421       * should be taking anything off the request list other than us.
1422       */
1423      assert(postcopy_has_request(rs));
1424  
1425      entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1426      block = entry->rb;
1427      *offset = entry->offset;
1428  
1429      if (entry->len > TARGET_PAGE_SIZE) {
1430          entry->len -= TARGET_PAGE_SIZE;
1431          entry->offset += TARGET_PAGE_SIZE;
1432      } else {
1433          memory_region_unref(block->mr);
1434          QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435          g_free(entry);
1436          migration_consume_urgent_request();
1437      }
1438  
1439      return block;
1440  }
1441  
1442  #if defined(__linux__)
1443  /**
1444   * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1445   *   is found, return RAM block pointer and page offset
1446   *
1447   * Returns pointer to the RAMBlock containing faulting page,
1448   *   NULL if no write faults are pending
1449   *
1450   * @rs: current RAM state
1451   * @offset: page offset from the beginning of the block
1452   */
1453  static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1454  {
1455      struct uffd_msg uffd_msg;
1456      void *page_address;
1457      RAMBlock *block;
1458      int res;
1459  
1460      if (!migrate_background_snapshot()) {
1461          return NULL;
1462      }
1463  
1464      res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1465      if (res <= 0) {
1466          return NULL;
1467      }
1468  
1469      page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1470      block = qemu_ram_block_from_host(page_address, false, offset);
1471      assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1472      return block;
1473  }
1474  
1475  /**
1476   * ram_save_release_protection: release UFFD write protection after
1477   *   a range of pages has been saved
1478   *
1479   * @rs: current RAM state
1480   * @pss: page-search-status structure
1481   * @start_page: index of the first page in the range relative to pss->block
1482   *
1483   * Returns 0 on success, negative value in case of an error
1484  */
1485  static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1486          unsigned long start_page)
1487  {
1488      int res = 0;
1489  
1490      /* Check if page is from UFFD-managed region. */
1491      if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1492          void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1493          uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1494  
1495          /* Flush async buffers before un-protect. */
1496          qemu_fflush(pss->pss_channel);
1497          /* Un-protect memory range. */
1498          res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1499                  false, false);
1500      }
1501  
1502      return res;
1503  }
1504  
1505  /* ram_write_tracking_available: check if kernel supports required UFFD features
1506   *
1507   * Returns true if supports, false otherwise
1508   */
1509  bool ram_write_tracking_available(void)
1510  {
1511      uint64_t uffd_features;
1512      int res;
1513  
1514      res = uffd_query_features(&uffd_features);
1515      return (res == 0 &&
1516              (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1517  }
1518  
1519  /* ram_write_tracking_compatible: check if guest configuration is
1520   *   compatible with 'write-tracking'
1521   *
1522   * Returns true if compatible, false otherwise
1523   */
1524  bool ram_write_tracking_compatible(void)
1525  {
1526      const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1527      int uffd_fd;
1528      RAMBlock *block;
1529      bool ret = false;
1530  
1531      /* Open UFFD file descriptor */
1532      uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1533      if (uffd_fd < 0) {
1534          return false;
1535      }
1536  
1537      RCU_READ_LOCK_GUARD();
1538  
1539      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1540          uint64_t uffd_ioctls;
1541  
1542          /* Nothing to do with read-only and MMIO-writable regions */
1543          if (block->mr->readonly || block->mr->rom_device) {
1544              continue;
1545          }
1546          /* Try to register block memory via UFFD-IO to track writes */
1547          if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1548                  UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1549              goto out;
1550          }
1551          if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1552              goto out;
1553          }
1554      }
1555      ret = true;
1556  
1557  out:
1558      uffd_close_fd(uffd_fd);
1559      return ret;
1560  }
1561  
1562  static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1563                                         ram_addr_t size)
1564  {
1565      const ram_addr_t end = offset + size;
1566  
1567      /*
1568       * We read one byte of each page; this will preallocate page tables if
1569       * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1570       * where no page was populated yet. This might require adaption when
1571       * supporting other mappings, like shmem.
1572       */
1573      for (; offset < end; offset += block->page_size) {
1574          char tmp = *((char *)block->host + offset);
1575  
1576          /* Don't optimize the read out */
1577          asm volatile("" : "+r" (tmp));
1578      }
1579  }
1580  
1581  static inline int populate_read_section(MemoryRegionSection *section,
1582                                          void *opaque)
1583  {
1584      const hwaddr size = int128_get64(section->size);
1585      hwaddr offset = section->offset_within_region;
1586      RAMBlock *block = section->mr->ram_block;
1587  
1588      populate_read_range(block, offset, size);
1589      return 0;
1590  }
1591  
1592  /*
1593   * ram_block_populate_read: preallocate page tables and populate pages in the
1594   *   RAM block by reading a byte of each page.
1595   *
1596   * Since it's solely used for userfault_fd WP feature, here we just
1597   *   hardcode page size to qemu_real_host_page_size.
1598   *
1599   * @block: RAM block to populate
1600   */
1601  static void ram_block_populate_read(RAMBlock *rb)
1602  {
1603      /*
1604       * Skip populating all pages that fall into a discarded range as managed by
1605       * a RamDiscardManager responsible for the mapped memory region of the
1606       * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1607       * must not get populated automatically. We don't have to track
1608       * modifications via userfaultfd WP reliably, because these pages will
1609       * not be part of the migration stream either way -- see
1610       * ramblock_dirty_bitmap_exclude_discarded_pages().
1611       *
1612       * Note: The result is only stable while migrating (precopy/postcopy).
1613       */
1614      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1615          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1616          MemoryRegionSection section = {
1617              .mr = rb->mr,
1618              .offset_within_region = 0,
1619              .size = rb->mr->size,
1620          };
1621  
1622          ram_discard_manager_replay_populated(rdm, &section,
1623                                               populate_read_section, NULL);
1624      } else {
1625          populate_read_range(rb, 0, rb->used_length);
1626      }
1627  }
1628  
1629  /*
1630   * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1631   */
1632  void ram_write_tracking_prepare(void)
1633  {
1634      RAMBlock *block;
1635  
1636      RCU_READ_LOCK_GUARD();
1637  
1638      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639          /* Nothing to do with read-only and MMIO-writable regions */
1640          if (block->mr->readonly || block->mr->rom_device) {
1641              continue;
1642          }
1643  
1644          /*
1645           * Populate pages of the RAM block before enabling userfault_fd
1646           * write protection.
1647           *
1648           * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1649           * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1650           * pages with pte_none() entries in page table.
1651           */
1652          ram_block_populate_read(block);
1653      }
1654  }
1655  
1656  static inline int uffd_protect_section(MemoryRegionSection *section,
1657                                         void *opaque)
1658  {
1659      const hwaddr size = int128_get64(section->size);
1660      const hwaddr offset = section->offset_within_region;
1661      RAMBlock *rb = section->mr->ram_block;
1662      int uffd_fd = (uintptr_t)opaque;
1663  
1664      return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1665                                    false);
1666  }
1667  
1668  static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1669  {
1670      assert(rb->flags & RAM_UF_WRITEPROTECT);
1671  
1672      /* See ram_block_populate_read() */
1673      if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1674          RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1675          MemoryRegionSection section = {
1676              .mr = rb->mr,
1677              .offset_within_region = 0,
1678              .size = rb->mr->size,
1679          };
1680  
1681          return ram_discard_manager_replay_populated(rdm, &section,
1682                                                      uffd_protect_section,
1683                                                      (void *)(uintptr_t)uffd_fd);
1684      }
1685      return uffd_change_protection(uffd_fd, rb->host,
1686                                    rb->used_length, true, false);
1687  }
1688  
1689  /*
1690   * ram_write_tracking_start: start UFFD-WP memory tracking
1691   *
1692   * Returns 0 for success or negative value in case of error
1693   */
1694  int ram_write_tracking_start(void)
1695  {
1696      int uffd_fd;
1697      RAMState *rs = ram_state;
1698      RAMBlock *block;
1699  
1700      /* Open UFFD file descriptor */
1701      uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1702      if (uffd_fd < 0) {
1703          return uffd_fd;
1704      }
1705      rs->uffdio_fd = uffd_fd;
1706  
1707      RCU_READ_LOCK_GUARD();
1708  
1709      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1710          /* Nothing to do with read-only and MMIO-writable regions */
1711          if (block->mr->readonly || block->mr->rom_device) {
1712              continue;
1713          }
1714  
1715          /* Register block memory with UFFD to track writes */
1716          if (uffd_register_memory(rs->uffdio_fd, block->host,
1717                  block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1718              goto fail;
1719          }
1720          block->flags |= RAM_UF_WRITEPROTECT;
1721          memory_region_ref(block->mr);
1722  
1723          /* Apply UFFD write protection to the block memory range */
1724          if (ram_block_uffd_protect(block, uffd_fd)) {
1725              goto fail;
1726          }
1727  
1728          trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1729                  block->host, block->max_length);
1730      }
1731  
1732      return 0;
1733  
1734  fail:
1735      error_report("ram_write_tracking_start() failed: restoring initial memory state");
1736  
1737      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1738          if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1739              continue;
1740          }
1741          uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1742          /* Cleanup flags and remove reference */
1743          block->flags &= ~RAM_UF_WRITEPROTECT;
1744          memory_region_unref(block->mr);
1745      }
1746  
1747      uffd_close_fd(uffd_fd);
1748      rs->uffdio_fd = -1;
1749      return -1;
1750  }
1751  
1752  /**
1753   * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1754   */
1755  void ram_write_tracking_stop(void)
1756  {
1757      RAMState *rs = ram_state;
1758      RAMBlock *block;
1759  
1760      RCU_READ_LOCK_GUARD();
1761  
1762      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1763          if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1764              continue;
1765          }
1766          uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1767  
1768          trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1769                  block->host, block->max_length);
1770  
1771          /* Cleanup flags and remove reference */
1772          block->flags &= ~RAM_UF_WRITEPROTECT;
1773          memory_region_unref(block->mr);
1774      }
1775  
1776      /* Finally close UFFD file descriptor */
1777      uffd_close_fd(rs->uffdio_fd);
1778      rs->uffdio_fd = -1;
1779  }
1780  
1781  #else
1782  /* No target OS support, stubs just fail or ignore */
1783  
1784  static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1785  {
1786      (void) rs;
1787      (void) offset;
1788  
1789      return NULL;
1790  }
1791  
1792  static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1793          unsigned long start_page)
1794  {
1795      (void) rs;
1796      (void) pss;
1797      (void) start_page;
1798  
1799      return 0;
1800  }
1801  
1802  bool ram_write_tracking_available(void)
1803  {
1804      return false;
1805  }
1806  
1807  bool ram_write_tracking_compatible(void)
1808  {
1809      assert(0);
1810      return false;
1811  }
1812  
1813  int ram_write_tracking_start(void)
1814  {
1815      assert(0);
1816      return -1;
1817  }
1818  
1819  void ram_write_tracking_stop(void)
1820  {
1821      assert(0);
1822  }
1823  #endif /* defined(__linux__) */
1824  
1825  /**
1826   * get_queued_page: unqueue a page from the postcopy requests
1827   *
1828   * Skips pages that are already sent (!dirty)
1829   *
1830   * Returns true if a queued page is found
1831   *
1832   * @rs: current RAM state
1833   * @pss: data about the state of the current dirty page scan
1834   */
1835  static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1836  {
1837      RAMBlock  *block;
1838      ram_addr_t offset;
1839      bool dirty;
1840  
1841      do {
1842          block = unqueue_page(rs, &offset);
1843          /*
1844           * We're sending this page, and since it's postcopy nothing else
1845           * will dirty it, and we must make sure it doesn't get sent again
1846           * even if this queue request was received after the background
1847           * search already sent it.
1848           */
1849          if (block) {
1850              unsigned long page;
1851  
1852              page = offset >> TARGET_PAGE_BITS;
1853              dirty = test_bit(page, block->bmap);
1854              if (!dirty) {
1855                  trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1856                                                  page);
1857              } else {
1858                  trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1859              }
1860          }
1861  
1862      } while (block && !dirty);
1863  
1864      if (!block) {
1865          /*
1866           * Poll write faults too if background snapshot is enabled; that's
1867           * when we have vcpus got blocked by the write protected pages.
1868           */
1869          block = poll_fault_page(rs, &offset);
1870      }
1871  
1872      if (block) {
1873          /*
1874           * We want the background search to continue from the queued page
1875           * since the guest is likely to want other pages near to the page
1876           * it just requested.
1877           */
1878          pss->block = block;
1879          pss->page = offset >> TARGET_PAGE_BITS;
1880  
1881          /*
1882           * This unqueued page would break the "one round" check, even is
1883           * really rare.
1884           */
1885          pss->complete_round = false;
1886      }
1887  
1888      return !!block;
1889  }
1890  
1891  /**
1892   * migration_page_queue_free: drop any remaining pages in the ram
1893   * request queue
1894   *
1895   * It should be empty at the end anyway, but in error cases there may
1896   * be some left.  in case that there is any page left, we drop it.
1897   *
1898   */
1899  static void migration_page_queue_free(RAMState *rs)
1900  {
1901      struct RAMSrcPageRequest *mspr, *next_mspr;
1902      /* This queue generally should be empty - but in the case of a failed
1903       * migration might have some droppings in.
1904       */
1905      RCU_READ_LOCK_GUARD();
1906      QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1907          memory_region_unref(mspr->rb->mr);
1908          QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1909          g_free(mspr);
1910      }
1911  }
1912  
1913  /**
1914   * ram_save_queue_pages: queue the page for transmission
1915   *
1916   * A request from postcopy destination for example.
1917   *
1918   * Returns zero on success or negative on error
1919   *
1920   * @rbname: Name of the RAMBLock of the request. NULL means the
1921   *          same that last one.
1922   * @start: starting address from the start of the RAMBlock
1923   * @len: length (in bytes) to send
1924   */
1925  int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1926  {
1927      RAMBlock *ramblock;
1928      RAMState *rs = ram_state;
1929  
1930      stat64_add(&mig_stats.postcopy_requests, 1);
1931      RCU_READ_LOCK_GUARD();
1932  
1933      if (!rbname) {
1934          /* Reuse last RAMBlock */
1935          ramblock = rs->last_req_rb;
1936  
1937          if (!ramblock) {
1938              /*
1939               * Shouldn't happen, we can't reuse the last RAMBlock if
1940               * it's the 1st request.
1941               */
1942              error_report("ram_save_queue_pages no previous block");
1943              return -1;
1944          }
1945      } else {
1946          ramblock = qemu_ram_block_by_name(rbname);
1947  
1948          if (!ramblock) {
1949              /* We shouldn't be asked for a non-existent RAMBlock */
1950              error_report("ram_save_queue_pages no block '%s'", rbname);
1951              return -1;
1952          }
1953          rs->last_req_rb = ramblock;
1954      }
1955      trace_ram_save_queue_pages(ramblock->idstr, start, len);
1956      if (!offset_in_ramblock(ramblock, start + len - 1)) {
1957          error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1958                       RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1959                       __func__, start, len, ramblock->used_length);
1960          return -1;
1961      }
1962  
1963      /*
1964       * When with postcopy preempt, we send back the page directly in the
1965       * rp-return thread.
1966       */
1967      if (postcopy_preempt_active()) {
1968          ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1969          size_t page_size = qemu_ram_pagesize(ramblock);
1970          PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1971          int ret = 0;
1972  
1973          qemu_mutex_lock(&rs->bitmap_mutex);
1974  
1975          pss_init(pss, ramblock, page_start);
1976          /*
1977           * Always use the preempt channel, and make sure it's there.  It's
1978           * safe to access without lock, because when rp-thread is running
1979           * we should be the only one who operates on the qemufile
1980           */
1981          pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
1982          assert(pss->pss_channel);
1983  
1984          /*
1985           * It must be either one or multiple of host page size.  Just
1986           * assert; if something wrong we're mostly split brain anyway.
1987           */
1988          assert(len % page_size == 0);
1989          while (len) {
1990              if (ram_save_host_page_urgent(pss)) {
1991                  error_report("%s: ram_save_host_page_urgent() failed: "
1992                               "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
1993                               __func__, ramblock->idstr, start);
1994                  ret = -1;
1995                  break;
1996              }
1997              /*
1998               * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
1999               * will automatically be moved and point to the next host page
2000               * we're going to send, so no need to update here.
2001               *
2002               * Normally QEMU never sends >1 host page in requests, so
2003               * logically we don't even need that as the loop should only
2004               * run once, but just to be consistent.
2005               */
2006              len -= page_size;
2007          };
2008          qemu_mutex_unlock(&rs->bitmap_mutex);
2009  
2010          return ret;
2011      }
2012  
2013      struct RAMSrcPageRequest *new_entry =
2014          g_new0(struct RAMSrcPageRequest, 1);
2015      new_entry->rb = ramblock;
2016      new_entry->offset = start;
2017      new_entry->len = len;
2018  
2019      memory_region_ref(ramblock->mr);
2020      qemu_mutex_lock(&rs->src_page_req_mutex);
2021      QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2022      migration_make_urgent_request();
2023      qemu_mutex_unlock(&rs->src_page_req_mutex);
2024  
2025      return 0;
2026  }
2027  
2028  static bool save_page_use_compression(RAMState *rs)
2029  {
2030      if (!migrate_compress()) {
2031          return false;
2032      }
2033  
2034      /*
2035       * If xbzrle is enabled (e.g., after first round of migration), stop
2036       * using the data compression. In theory, xbzrle can do better than
2037       * compression.
2038       */
2039      if (rs->xbzrle_started) {
2040          return false;
2041      }
2042  
2043      return true;
2044  }
2045  
2046  /*
2047   * try to compress the page before posting it out, return true if the page
2048   * has been properly handled by compression, otherwise needs other
2049   * paths to handle it
2050   */
2051  static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2052                                 RAMBlock *block, ram_addr_t offset)
2053  {
2054      if (!save_page_use_compression(rs)) {
2055          return false;
2056      }
2057  
2058      /*
2059       * When starting the process of a new block, the first page of
2060       * the block should be sent out before other pages in the same
2061       * block, and all the pages in last block should have been sent
2062       * out, keeping this order is important, because the 'cont' flag
2063       * is used to avoid resending the block name.
2064       *
2065       * We post the fist page as normal page as compression will take
2066       * much CPU resource.
2067       */
2068      if (block != pss->last_sent_block) {
2069          ram_flush_compressed_data(rs);
2070          return false;
2071      }
2072  
2073      if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2074          return true;
2075      }
2076  
2077      compression_counters.busy++;
2078      return false;
2079  }
2080  
2081  /**
2082   * ram_save_target_page_legacy: save one target page
2083   *
2084   * Returns the number of pages written
2085   *
2086   * @rs: current RAM state
2087   * @pss: data about the page we want to send
2088   */
2089  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2090  {
2091      RAMBlock *block = pss->block;
2092      ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2093      int res;
2094  
2095      if (control_save_page(pss, block, offset, &res)) {
2096          return res;
2097      }
2098  
2099      if (save_compress_page(rs, pss, block, offset)) {
2100          return 1;
2101      }
2102  
2103      res = save_zero_page(pss, pss->pss_channel, block, offset);
2104      if (res > 0) {
2105          /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2106           * page would be stale
2107           */
2108          if (rs->xbzrle_started) {
2109              XBZRLE_cache_lock();
2110              xbzrle_cache_zero_page(rs, block->offset + offset);
2111              XBZRLE_cache_unlock();
2112          }
2113          return res;
2114      }
2115  
2116      /*
2117       * Do not use multifd in postcopy as one whole host page should be
2118       * placed.  Meanwhile postcopy requires atomic update of pages, so even
2119       * if host page size == guest page size the dest guest during run may
2120       * still see partially copied pages which is data corruption.
2121       */
2122      if (migrate_multifd() && !migration_in_postcopy()) {
2123          return ram_save_multifd_page(pss->pss_channel, block, offset);
2124      }
2125  
2126      return ram_save_page(rs, pss);
2127  }
2128  
2129  /* Should be called before sending a host page */
2130  static void pss_host_page_prepare(PageSearchStatus *pss)
2131  {
2132      /* How many guest pages are there in one host page? */
2133      size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2134  
2135      pss->host_page_sending = true;
2136      if (guest_pfns <= 1) {
2137          /*
2138           * This covers both when guest psize == host psize, or when guest
2139           * has larger psize than the host (guest_pfns==0).
2140           *
2141           * For the latter, we always send one whole guest page per
2142           * iteration of the host page (example: an Alpha VM on x86 host
2143           * will have guest psize 8K while host psize 4K).
2144           */
2145          pss->host_page_start = pss->page;
2146          pss->host_page_end = pss->page + 1;
2147      } else {
2148          /*
2149           * The host page spans over multiple guest pages, we send them
2150           * within the same host page iteration.
2151           */
2152          pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2153          pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2154      }
2155  }
2156  
2157  /*
2158   * Whether the page pointed by PSS is within the host page being sent.
2159   * Must be called after a previous pss_host_page_prepare().
2160   */
2161  static bool pss_within_range(PageSearchStatus *pss)
2162  {
2163      ram_addr_t ram_addr;
2164  
2165      assert(pss->host_page_sending);
2166  
2167      /* Over host-page boundary? */
2168      if (pss->page >= pss->host_page_end) {
2169          return false;
2170      }
2171  
2172      ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2173  
2174      return offset_in_ramblock(pss->block, ram_addr);
2175  }
2176  
2177  static void pss_host_page_finish(PageSearchStatus *pss)
2178  {
2179      pss->host_page_sending = false;
2180      /* This is not needed, but just to reset it */
2181      pss->host_page_start = pss->host_page_end = 0;
2182  }
2183  
2184  /*
2185   * Send an urgent host page specified by `pss'.  Need to be called with
2186   * bitmap_mutex held.
2187   *
2188   * Returns 0 if save host page succeeded, false otherwise.
2189   */
2190  static int ram_save_host_page_urgent(PageSearchStatus *pss)
2191  {
2192      bool page_dirty, sent = false;
2193      RAMState *rs = ram_state;
2194      int ret = 0;
2195  
2196      trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2197      pss_host_page_prepare(pss);
2198  
2199      /*
2200       * If precopy is sending the same page, let it be done in precopy, or
2201       * we could send the same page in two channels and none of them will
2202       * receive the whole page.
2203       */
2204      if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2205          trace_postcopy_preempt_hit(pss->block->idstr,
2206                                     pss->page << TARGET_PAGE_BITS);
2207          return 0;
2208      }
2209  
2210      do {
2211          page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2212  
2213          if (page_dirty) {
2214              /* Be strict to return code; it must be 1, or what else? */
2215              if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2216                  error_report_once("%s: ram_save_target_page failed", __func__);
2217                  ret = -1;
2218                  goto out;
2219              }
2220              sent = true;
2221          }
2222          pss_find_next_dirty(pss);
2223      } while (pss_within_range(pss));
2224  out:
2225      pss_host_page_finish(pss);
2226      /* For urgent requests, flush immediately if sent */
2227      if (sent) {
2228          qemu_fflush(pss->pss_channel);
2229      }
2230      return ret;
2231  }
2232  
2233  /**
2234   * ram_save_host_page: save a whole host page
2235   *
2236   * Starting at *offset send pages up to the end of the current host
2237   * page. It's valid for the initial offset to point into the middle of
2238   * a host page in which case the remainder of the hostpage is sent.
2239   * Only dirty target pages are sent. Note that the host page size may
2240   * be a huge page for this block.
2241   *
2242   * The saving stops at the boundary of the used_length of the block
2243   * if the RAMBlock isn't a multiple of the host page size.
2244   *
2245   * The caller must be with ram_state.bitmap_mutex held to call this
2246   * function.  Note that this function can temporarily release the lock, but
2247   * when the function is returned it'll make sure the lock is still held.
2248   *
2249   * Returns the number of pages written or negative on error
2250   *
2251   * @rs: current RAM state
2252   * @pss: data about the page we want to send
2253   */
2254  static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2255  {
2256      bool page_dirty, preempt_active = postcopy_preempt_active();
2257      int tmppages, pages = 0;
2258      size_t pagesize_bits =
2259          qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2260      unsigned long start_page = pss->page;
2261      int res;
2262  
2263      if (ramblock_is_ignored(pss->block)) {
2264          error_report("block %s should not be migrated !", pss->block->idstr);
2265          return 0;
2266      }
2267  
2268      /* Update host page boundary information */
2269      pss_host_page_prepare(pss);
2270  
2271      do {
2272          page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2273  
2274          /* Check the pages is dirty and if it is send it */
2275          if (page_dirty) {
2276              /*
2277               * Properly yield the lock only in postcopy preempt mode
2278               * because both migration thread and rp-return thread can
2279               * operate on the bitmaps.
2280               */
2281              if (preempt_active) {
2282                  qemu_mutex_unlock(&rs->bitmap_mutex);
2283              }
2284              tmppages = migration_ops->ram_save_target_page(rs, pss);
2285              if (tmppages >= 0) {
2286                  pages += tmppages;
2287                  /*
2288                   * Allow rate limiting to happen in the middle of huge pages if
2289                   * something is sent in the current iteration.
2290                   */
2291                  if (pagesize_bits > 1 && tmppages > 0) {
2292                      migration_rate_limit();
2293                  }
2294              }
2295              if (preempt_active) {
2296                  qemu_mutex_lock(&rs->bitmap_mutex);
2297              }
2298          } else {
2299              tmppages = 0;
2300          }
2301  
2302          if (tmppages < 0) {
2303              pss_host_page_finish(pss);
2304              return tmppages;
2305          }
2306  
2307          pss_find_next_dirty(pss);
2308      } while (pss_within_range(pss));
2309  
2310      pss_host_page_finish(pss);
2311  
2312      res = ram_save_release_protection(rs, pss, start_page);
2313      return (res < 0 ? res : pages);
2314  }
2315  
2316  /**
2317   * ram_find_and_save_block: finds a dirty page and sends it to f
2318   *
2319   * Called within an RCU critical section.
2320   *
2321   * Returns the number of pages written where zero means no dirty pages,
2322   * or negative on error
2323   *
2324   * @rs: current RAM state
2325   *
2326   * On systems where host-page-size > target-page-size it will send all the
2327   * pages in a host page that are dirty.
2328   */
2329  static int ram_find_and_save_block(RAMState *rs)
2330  {
2331      PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2332      int pages = 0;
2333  
2334      /* No dirty page as there is zero RAM */
2335      if (!rs->ram_bytes_total) {
2336          return pages;
2337      }
2338  
2339      /*
2340       * Always keep last_seen_block/last_page valid during this procedure,
2341       * because find_dirty_block() relies on these values (e.g., we compare
2342       * last_seen_block with pss.block to see whether we searched all the
2343       * ramblocks) to detect the completion of migration.  Having NULL value
2344       * of last_seen_block can conditionally cause below loop to run forever.
2345       */
2346      if (!rs->last_seen_block) {
2347          rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2348          rs->last_page = 0;
2349      }
2350  
2351      pss_init(pss, rs->last_seen_block, rs->last_page);
2352  
2353      while (true){
2354          if (!get_queued_page(rs, pss)) {
2355              /* priority queue empty, so just search for something dirty */
2356              int res = find_dirty_block(rs, pss);
2357              if (res != PAGE_DIRTY_FOUND) {
2358                  if (res == PAGE_ALL_CLEAN) {
2359                      break;
2360                  } else if (res == PAGE_TRY_AGAIN) {
2361                      continue;
2362                  } else if (res < 0) {
2363                      pages = res;
2364                      break;
2365                  }
2366              }
2367          }
2368          pages = ram_save_host_page(rs, pss);
2369          if (pages) {
2370              break;
2371          }
2372      }
2373  
2374      rs->last_seen_block = pss->block;
2375      rs->last_page = pss->page;
2376  
2377      return pages;
2378  }
2379  
2380  static uint64_t ram_bytes_total_with_ignored(void)
2381  {
2382      RAMBlock *block;
2383      uint64_t total = 0;
2384  
2385      RCU_READ_LOCK_GUARD();
2386  
2387      RAMBLOCK_FOREACH_MIGRATABLE(block) {
2388          total += block->used_length;
2389      }
2390      return total;
2391  }
2392  
2393  uint64_t ram_bytes_total(void)
2394  {
2395      RAMBlock *block;
2396      uint64_t total = 0;
2397  
2398      RCU_READ_LOCK_GUARD();
2399  
2400      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2401          total += block->used_length;
2402      }
2403      return total;
2404  }
2405  
2406  static void xbzrle_load_setup(void)
2407  {
2408      XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2409  }
2410  
2411  static void xbzrle_load_cleanup(void)
2412  {
2413      g_free(XBZRLE.decoded_buf);
2414      XBZRLE.decoded_buf = NULL;
2415  }
2416  
2417  static void ram_state_cleanup(RAMState **rsp)
2418  {
2419      if (*rsp) {
2420          migration_page_queue_free(*rsp);
2421          qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2422          qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2423          g_free(*rsp);
2424          *rsp = NULL;
2425      }
2426  }
2427  
2428  static void xbzrle_cleanup(void)
2429  {
2430      XBZRLE_cache_lock();
2431      if (XBZRLE.cache) {
2432          cache_fini(XBZRLE.cache);
2433          g_free(XBZRLE.encoded_buf);
2434          g_free(XBZRLE.current_buf);
2435          g_free(XBZRLE.zero_target_page);
2436          XBZRLE.cache = NULL;
2437          XBZRLE.encoded_buf = NULL;
2438          XBZRLE.current_buf = NULL;
2439          XBZRLE.zero_target_page = NULL;
2440      }
2441      XBZRLE_cache_unlock();
2442  }
2443  
2444  static void ram_save_cleanup(void *opaque)
2445  {
2446      RAMState **rsp = opaque;
2447      RAMBlock *block;
2448  
2449      /* We don't use dirty log with background snapshots */
2450      if (!migrate_background_snapshot()) {
2451          /* caller have hold iothread lock or is in a bh, so there is
2452           * no writing race against the migration bitmap
2453           */
2454          if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2455              /*
2456               * do not stop dirty log without starting it, since
2457               * memory_global_dirty_log_stop will assert that
2458               * memory_global_dirty_log_start/stop used in pairs
2459               */
2460              memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2461          }
2462      }
2463  
2464      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2465          g_free(block->clear_bmap);
2466          block->clear_bmap = NULL;
2467          g_free(block->bmap);
2468          block->bmap = NULL;
2469      }
2470  
2471      xbzrle_cleanup();
2472      compress_threads_save_cleanup();
2473      ram_state_cleanup(rsp);
2474      g_free(migration_ops);
2475      migration_ops = NULL;
2476  }
2477  
2478  static void ram_state_reset(RAMState *rs)
2479  {
2480      int i;
2481  
2482      for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2483          rs->pss[i].last_sent_block = NULL;
2484      }
2485  
2486      rs->last_seen_block = NULL;
2487      rs->last_page = 0;
2488      rs->last_version = ram_list.version;
2489      rs->xbzrle_started = false;
2490  }
2491  
2492  #define MAX_WAIT 50 /* ms, half buffered_file limit */
2493  
2494  /* **** functions for postcopy ***** */
2495  
2496  void ram_postcopy_migrated_memory_release(MigrationState *ms)
2497  {
2498      struct RAMBlock *block;
2499  
2500      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501          unsigned long *bitmap = block->bmap;
2502          unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2503          unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2504  
2505          while (run_start < range) {
2506              unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2507              ram_discard_range(block->idstr,
2508                                ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2509                                ((ram_addr_t)(run_end - run_start))
2510                                  << TARGET_PAGE_BITS);
2511              run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2512          }
2513      }
2514  }
2515  
2516  /**
2517   * postcopy_send_discard_bm_ram: discard a RAMBlock
2518   *
2519   * Callback from postcopy_each_ram_send_discard for each RAMBlock
2520   *
2521   * @ms: current migration state
2522   * @block: RAMBlock to discard
2523   */
2524  static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2525  {
2526      unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2527      unsigned long current;
2528      unsigned long *bitmap = block->bmap;
2529  
2530      for (current = 0; current < end; ) {
2531          unsigned long one = find_next_bit(bitmap, end, current);
2532          unsigned long zero, discard_length;
2533  
2534          if (one >= end) {
2535              break;
2536          }
2537  
2538          zero = find_next_zero_bit(bitmap, end, one + 1);
2539  
2540          if (zero >= end) {
2541              discard_length = end - one;
2542          } else {
2543              discard_length = zero - one;
2544          }
2545          postcopy_discard_send_range(ms, one, discard_length);
2546          current = one + discard_length;
2547      }
2548  }
2549  
2550  static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2551  
2552  /**
2553   * postcopy_each_ram_send_discard: discard all RAMBlocks
2554   *
2555   * Utility for the outgoing postcopy code.
2556   *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2557   *   passing it bitmap indexes and name.
2558   * (qemu_ram_foreach_block ends up passing unscaled lengths
2559   *  which would mean postcopy code would have to deal with target page)
2560   *
2561   * @ms: current migration state
2562   */
2563  static void postcopy_each_ram_send_discard(MigrationState *ms)
2564  {
2565      struct RAMBlock *block;
2566  
2567      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2568          postcopy_discard_send_init(ms, block->idstr);
2569  
2570          /*
2571           * Deal with TPS != HPS and huge pages.  It discard any partially sent
2572           * host-page size chunks, mark any partially dirty host-page size
2573           * chunks as all dirty.  In this case the host-page is the host-page
2574           * for the particular RAMBlock, i.e. it might be a huge page.
2575           */
2576          postcopy_chunk_hostpages_pass(ms, block);
2577  
2578          /*
2579           * Postcopy sends chunks of bitmap over the wire, but it
2580           * just needs indexes at this point, avoids it having
2581           * target page specific code.
2582           */
2583          postcopy_send_discard_bm_ram(ms, block);
2584          postcopy_discard_send_finish(ms);
2585      }
2586  }
2587  
2588  /**
2589   * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2590   *
2591   * Helper for postcopy_chunk_hostpages; it's called twice to
2592   * canonicalize the two bitmaps, that are similar, but one is
2593   * inverted.
2594   *
2595   * Postcopy requires that all target pages in a hostpage are dirty or
2596   * clean, not a mix.  This function canonicalizes the bitmaps.
2597   *
2598   * @ms: current migration state
2599   * @block: block that contains the page we want to canonicalize
2600   */
2601  static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2602  {
2603      RAMState *rs = ram_state;
2604      unsigned long *bitmap = block->bmap;
2605      unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2606      unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2607      unsigned long run_start;
2608  
2609      if (block->page_size == TARGET_PAGE_SIZE) {
2610          /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2611          return;
2612      }
2613  
2614      /* Find a dirty page */
2615      run_start = find_next_bit(bitmap, pages, 0);
2616  
2617      while (run_start < pages) {
2618  
2619          /*
2620           * If the start of this run of pages is in the middle of a host
2621           * page, then we need to fixup this host page.
2622           */
2623          if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2624              /* Find the end of this run */
2625              run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2626              /*
2627               * If the end isn't at the start of a host page, then the
2628               * run doesn't finish at the end of a host page
2629               * and we need to discard.
2630               */
2631          }
2632  
2633          if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2634              unsigned long page;
2635              unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2636                                                               host_ratio);
2637              run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2638  
2639              /* Clean up the bitmap */
2640              for (page = fixup_start_addr;
2641                   page < fixup_start_addr + host_ratio; page++) {
2642                  /*
2643                   * Remark them as dirty, updating the count for any pages
2644                   * that weren't previously dirty.
2645                   */
2646                  rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2647              }
2648          }
2649  
2650          /* Find the next dirty page for the next iteration */
2651          run_start = find_next_bit(bitmap, pages, run_start);
2652      }
2653  }
2654  
2655  /**
2656   * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2657   *
2658   * Transmit the set of pages to be discarded after precopy to the target
2659   * these are pages that:
2660   *     a) Have been previously transmitted but are now dirty again
2661   *     b) Pages that have never been transmitted, this ensures that
2662   *        any pages on the destination that have been mapped by background
2663   *        tasks get discarded (transparent huge pages is the specific concern)
2664   * Hopefully this is pretty sparse
2665   *
2666   * @ms: current migration state
2667   */
2668  void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2669  {
2670      RAMState *rs = ram_state;
2671  
2672      RCU_READ_LOCK_GUARD();
2673  
2674      /* This should be our last sync, the src is now paused */
2675      migration_bitmap_sync(rs, false);
2676  
2677      /* Easiest way to make sure we don't resume in the middle of a host-page */
2678      rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2679      rs->last_seen_block = NULL;
2680      rs->last_page = 0;
2681  
2682      postcopy_each_ram_send_discard(ms);
2683  
2684      trace_ram_postcopy_send_discard_bitmap();
2685  }
2686  
2687  /**
2688   * ram_discard_range: discard dirtied pages at the beginning of postcopy
2689   *
2690   * Returns zero on success
2691   *
2692   * @rbname: name of the RAMBlock of the request. NULL means the
2693   *          same that last one.
2694   * @start: RAMBlock starting page
2695   * @length: RAMBlock size
2696   */
2697  int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2698  {
2699      trace_ram_discard_range(rbname, start, length);
2700  
2701      RCU_READ_LOCK_GUARD();
2702      RAMBlock *rb = qemu_ram_block_by_name(rbname);
2703  
2704      if (!rb) {
2705          error_report("ram_discard_range: Failed to find block '%s'", rbname);
2706          return -1;
2707      }
2708  
2709      /*
2710       * On source VM, we don't need to update the received bitmap since
2711       * we don't even have one.
2712       */
2713      if (rb->receivedmap) {
2714          bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2715                       length >> qemu_target_page_bits());
2716      }
2717  
2718      return ram_block_discard_range(rb, start, length);
2719  }
2720  
2721  /*
2722   * For every allocation, we will try not to crash the VM if the
2723   * allocation failed.
2724   */
2725  static int xbzrle_init(void)
2726  {
2727      Error *local_err = NULL;
2728  
2729      if (!migrate_xbzrle()) {
2730          return 0;
2731      }
2732  
2733      XBZRLE_cache_lock();
2734  
2735      XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2736      if (!XBZRLE.zero_target_page) {
2737          error_report("%s: Error allocating zero page", __func__);
2738          goto err_out;
2739      }
2740  
2741      XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2742                                TARGET_PAGE_SIZE, &local_err);
2743      if (!XBZRLE.cache) {
2744          error_report_err(local_err);
2745          goto free_zero_page;
2746      }
2747  
2748      XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2749      if (!XBZRLE.encoded_buf) {
2750          error_report("%s: Error allocating encoded_buf", __func__);
2751          goto free_cache;
2752      }
2753  
2754      XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2755      if (!XBZRLE.current_buf) {
2756          error_report("%s: Error allocating current_buf", __func__);
2757          goto free_encoded_buf;
2758      }
2759  
2760      /* We are all good */
2761      XBZRLE_cache_unlock();
2762      return 0;
2763  
2764  free_encoded_buf:
2765      g_free(XBZRLE.encoded_buf);
2766      XBZRLE.encoded_buf = NULL;
2767  free_cache:
2768      cache_fini(XBZRLE.cache);
2769      XBZRLE.cache = NULL;
2770  free_zero_page:
2771      g_free(XBZRLE.zero_target_page);
2772      XBZRLE.zero_target_page = NULL;
2773  err_out:
2774      XBZRLE_cache_unlock();
2775      return -ENOMEM;
2776  }
2777  
2778  static int ram_state_init(RAMState **rsp)
2779  {
2780      *rsp = g_try_new0(RAMState, 1);
2781  
2782      if (!*rsp) {
2783          error_report("%s: Init ramstate fail", __func__);
2784          return -1;
2785      }
2786  
2787      qemu_mutex_init(&(*rsp)->bitmap_mutex);
2788      qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2789      QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2790      (*rsp)->ram_bytes_total = ram_bytes_total();
2791  
2792      /*
2793       * Count the total number of pages used by ram blocks not including any
2794       * gaps due to alignment or unplugs.
2795       * This must match with the initial values of dirty bitmap.
2796       */
2797      (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2798      ram_state_reset(*rsp);
2799  
2800      return 0;
2801  }
2802  
2803  static void ram_list_init_bitmaps(void)
2804  {
2805      MigrationState *ms = migrate_get_current();
2806      RAMBlock *block;
2807      unsigned long pages;
2808      uint8_t shift;
2809  
2810      /* Skip setting bitmap if there is no RAM */
2811      if (ram_bytes_total()) {
2812          shift = ms->clear_bitmap_shift;
2813          if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2814              error_report("clear_bitmap_shift (%u) too big, using "
2815                           "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2816              shift = CLEAR_BITMAP_SHIFT_MAX;
2817          } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2818              error_report("clear_bitmap_shift (%u) too small, using "
2819                           "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2820              shift = CLEAR_BITMAP_SHIFT_MIN;
2821          }
2822  
2823          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824              pages = block->max_length >> TARGET_PAGE_BITS;
2825              /*
2826               * The initial dirty bitmap for migration must be set with all
2827               * ones to make sure we'll migrate every guest RAM page to
2828               * destination.
2829               * Here we set RAMBlock.bmap all to 1 because when rebegin a
2830               * new migration after a failed migration, ram_list.
2831               * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2832               * guest memory.
2833               */
2834              block->bmap = bitmap_new(pages);
2835              bitmap_set(block->bmap, 0, pages);
2836              block->clear_bmap_shift = shift;
2837              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2838          }
2839      }
2840  }
2841  
2842  static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843  {
2844      unsigned long pages;
2845      RAMBlock *rb;
2846  
2847      RCU_READ_LOCK_GUARD();
2848  
2849      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850              pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851              rs->migration_dirty_pages -= pages;
2852      }
2853  }
2854  
2855  static void ram_init_bitmaps(RAMState *rs)
2856  {
2857      /* For memory_global_dirty_log_start below.  */
2858      qemu_mutex_lock_iothread();
2859      qemu_mutex_lock_ramlist();
2860  
2861      WITH_RCU_READ_LOCK_GUARD() {
2862          ram_list_init_bitmaps();
2863          /* We don't use dirty log with background snapshots */
2864          if (!migrate_background_snapshot()) {
2865              memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2866              migration_bitmap_sync_precopy(rs, false);
2867          }
2868      }
2869      qemu_mutex_unlock_ramlist();
2870      qemu_mutex_unlock_iothread();
2871  
2872      /*
2873       * After an eventual first bitmap sync, fixup the initial bitmap
2874       * containing all 1s to exclude any discarded pages from migration.
2875       */
2876      migration_bitmap_clear_discarded_pages(rs);
2877  }
2878  
2879  static int ram_init_all(RAMState **rsp)
2880  {
2881      if (ram_state_init(rsp)) {
2882          return -1;
2883      }
2884  
2885      if (xbzrle_init()) {
2886          ram_state_cleanup(rsp);
2887          return -1;
2888      }
2889  
2890      ram_init_bitmaps(*rsp);
2891  
2892      return 0;
2893  }
2894  
2895  static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2896  {
2897      RAMBlock *block;
2898      uint64_t pages = 0;
2899  
2900      /*
2901       * Postcopy is not using xbzrle/compression, so no need for that.
2902       * Also, since source are already halted, we don't need to care
2903       * about dirty page logging as well.
2904       */
2905  
2906      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2907          pages += bitmap_count_one(block->bmap,
2908                                    block->used_length >> TARGET_PAGE_BITS);
2909      }
2910  
2911      /* This may not be aligned with current bitmaps. Recalculate. */
2912      rs->migration_dirty_pages = pages;
2913  
2914      ram_state_reset(rs);
2915  
2916      /* Update RAMState cache of output QEMUFile */
2917      rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2918  
2919      trace_ram_state_resume_prepare(pages);
2920  }
2921  
2922  /*
2923   * This function clears bits of the free pages reported by the caller from the
2924   * migration dirty bitmap. @addr is the host address corresponding to the
2925   * start of the continuous guest free pages, and @len is the total bytes of
2926   * those pages.
2927   */
2928  void qemu_guest_free_page_hint(void *addr, size_t len)
2929  {
2930      RAMBlock *block;
2931      ram_addr_t offset;
2932      size_t used_len, start, npages;
2933      MigrationState *s = migrate_get_current();
2934  
2935      /* This function is currently expected to be used during live migration */
2936      if (!migration_is_setup_or_active(s->state)) {
2937          return;
2938      }
2939  
2940      for (; len > 0; len -= used_len, addr += used_len) {
2941          block = qemu_ram_block_from_host(addr, false, &offset);
2942          if (unlikely(!block || offset >= block->used_length)) {
2943              /*
2944               * The implementation might not support RAMBlock resize during
2945               * live migration, but it could happen in theory with future
2946               * updates. So we add a check here to capture that case.
2947               */
2948              error_report_once("%s unexpected error", __func__);
2949              return;
2950          }
2951  
2952          if (len <= block->used_length - offset) {
2953              used_len = len;
2954          } else {
2955              used_len = block->used_length - offset;
2956          }
2957  
2958          start = offset >> TARGET_PAGE_BITS;
2959          npages = used_len >> TARGET_PAGE_BITS;
2960  
2961          qemu_mutex_lock(&ram_state->bitmap_mutex);
2962          /*
2963           * The skipped free pages are equavalent to be sent from clear_bmap's
2964           * perspective, so clear the bits from the memory region bitmap which
2965           * are initially set. Otherwise those skipped pages will be sent in
2966           * the next round after syncing from the memory region bitmap.
2967           */
2968          migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2969          ram_state->migration_dirty_pages -=
2970                        bitmap_count_one_with_offset(block->bmap, start, npages);
2971          bitmap_clear(block->bmap, start, npages);
2972          qemu_mutex_unlock(&ram_state->bitmap_mutex);
2973      }
2974  }
2975  
2976  /*
2977   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2978   * long-running RCU critical section.  When rcu-reclaims in the code
2979   * start to become numerous it will be necessary to reduce the
2980   * granularity of these critical sections.
2981   */
2982  
2983  /**
2984   * ram_save_setup: Setup RAM for migration
2985   *
2986   * Returns zero to indicate success and negative for error
2987   *
2988   * @f: QEMUFile where to send the data
2989   * @opaque: RAMState pointer
2990   */
2991  static int ram_save_setup(QEMUFile *f, void *opaque)
2992  {
2993      RAMState **rsp = opaque;
2994      RAMBlock *block;
2995      int ret;
2996  
2997      if (compress_threads_save_setup()) {
2998          return -1;
2999      }
3000  
3001      /* migration has already setup the bitmap, reuse it. */
3002      if (!migration_in_colo_state()) {
3003          if (ram_init_all(rsp) != 0) {
3004              compress_threads_save_cleanup();
3005              return -1;
3006          }
3007      }
3008      (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3009  
3010      WITH_RCU_READ_LOCK_GUARD() {
3011          qemu_put_be64(f, ram_bytes_total_with_ignored()
3012                           | RAM_SAVE_FLAG_MEM_SIZE);
3013  
3014          RAMBLOCK_FOREACH_MIGRATABLE(block) {
3015              qemu_put_byte(f, strlen(block->idstr));
3016              qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3017              qemu_put_be64(f, block->used_length);
3018              if (migrate_postcopy_ram() && block->page_size !=
3019                                            qemu_host_page_size) {
3020                  qemu_put_be64(f, block->page_size);
3021              }
3022              if (migrate_ignore_shared()) {
3023                  qemu_put_be64(f, block->mr->addr);
3024              }
3025          }
3026      }
3027  
3028      ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3029      ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3030  
3031      migration_ops = g_malloc0(sizeof(MigrationOps));
3032      migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3033      ret = multifd_send_sync_main(f);
3034      if (ret < 0) {
3035          return ret;
3036      }
3037  
3038      if (!migrate_multifd_flush_after_each_section()) {
3039          qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3040      }
3041  
3042      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3043      qemu_fflush(f);
3044  
3045      return 0;
3046  }
3047  
3048  /**
3049   * ram_save_iterate: iterative stage for migration
3050   *
3051   * Returns zero to indicate success and negative for error
3052   *
3053   * @f: QEMUFile where to send the data
3054   * @opaque: RAMState pointer
3055   */
3056  static int ram_save_iterate(QEMUFile *f, void *opaque)
3057  {
3058      RAMState **temp = opaque;
3059      RAMState *rs = *temp;
3060      int ret = 0;
3061      int i;
3062      int64_t t0;
3063      int done = 0;
3064  
3065      if (blk_mig_bulk_active()) {
3066          /* Avoid transferring ram during bulk phase of block migration as
3067           * the bulk phase will usually take a long time and transferring
3068           * ram updates during that time is pointless. */
3069          goto out;
3070      }
3071  
3072      /*
3073       * We'll take this lock a little bit long, but it's okay for two reasons.
3074       * Firstly, the only possible other thread to take it is who calls
3075       * qemu_guest_free_page_hint(), which should be rare; secondly, see
3076       * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3077       * guarantees that we'll at least released it in a regular basis.
3078       */
3079      qemu_mutex_lock(&rs->bitmap_mutex);
3080      WITH_RCU_READ_LOCK_GUARD() {
3081          if (ram_list.version != rs->last_version) {
3082              ram_state_reset(rs);
3083          }
3084  
3085          /* Read version before ram_list.blocks */
3086          smp_rmb();
3087  
3088          ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3089  
3090          t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3091          i = 0;
3092          while ((ret = migration_rate_exceeded(f)) == 0 ||
3093                 postcopy_has_request(rs)) {
3094              int pages;
3095  
3096              if (qemu_file_get_error(f)) {
3097                  break;
3098              }
3099  
3100              pages = ram_find_and_save_block(rs);
3101              /* no more pages to sent */
3102              if (pages == 0) {
3103                  done = 1;
3104                  break;
3105              }
3106  
3107              if (pages < 0) {
3108                  qemu_file_set_error(f, pages);
3109                  break;
3110              }
3111  
3112              rs->target_page_count += pages;
3113  
3114              /*
3115               * During postcopy, it is necessary to make sure one whole host
3116               * page is sent in one chunk.
3117               */
3118              if (migrate_postcopy_ram()) {
3119                  ram_flush_compressed_data(rs);
3120              }
3121  
3122              /*
3123               * we want to check in the 1st loop, just in case it was the 1st
3124               * time and we had to sync the dirty bitmap.
3125               * qemu_clock_get_ns() is a bit expensive, so we only check each
3126               * some iterations
3127               */
3128              if ((i & 63) == 0) {
3129                  uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3130                                1000000;
3131                  if (t1 > MAX_WAIT) {
3132                      trace_ram_save_iterate_big_wait(t1, i);
3133                      break;
3134                  }
3135              }
3136              i++;
3137          }
3138      }
3139      qemu_mutex_unlock(&rs->bitmap_mutex);
3140  
3141      /*
3142       * Must occur before EOS (or any QEMUFile operation)
3143       * because of RDMA protocol.
3144       */
3145      ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3146  
3147  out:
3148      if (ret >= 0
3149          && migration_is_setup_or_active(migrate_get_current()->state)) {
3150          if (migrate_multifd_flush_after_each_section()) {
3151              ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3152              if (ret < 0) {
3153                  return ret;
3154              }
3155          }
3156  
3157          qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3158          qemu_fflush(f);
3159          ram_transferred_add(8);
3160  
3161          ret = qemu_file_get_error(f);
3162      }
3163      if (ret < 0) {
3164          return ret;
3165      }
3166  
3167      return done;
3168  }
3169  
3170  /**
3171   * ram_save_complete: function called to send the remaining amount of ram
3172   *
3173   * Returns zero to indicate success or negative on error
3174   *
3175   * Called with iothread lock
3176   *
3177   * @f: QEMUFile where to send the data
3178   * @opaque: RAMState pointer
3179   */
3180  static int ram_save_complete(QEMUFile *f, void *opaque)
3181  {
3182      RAMState **temp = opaque;
3183      RAMState *rs = *temp;
3184      int ret = 0;
3185  
3186      rs->last_stage = !migration_in_colo_state();
3187  
3188      WITH_RCU_READ_LOCK_GUARD() {
3189          if (!migration_in_postcopy()) {
3190              migration_bitmap_sync_precopy(rs, true);
3191          }
3192  
3193          ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3194  
3195          /* try transferring iterative blocks of memory */
3196  
3197          /* flush all remaining blocks regardless of rate limiting */
3198          qemu_mutex_lock(&rs->bitmap_mutex);
3199          while (true) {
3200              int pages;
3201  
3202              pages = ram_find_and_save_block(rs);
3203              /* no more blocks to sent */
3204              if (pages == 0) {
3205                  break;
3206              }
3207              if (pages < 0) {
3208                  ret = pages;
3209                  break;
3210              }
3211          }
3212          qemu_mutex_unlock(&rs->bitmap_mutex);
3213  
3214          ram_flush_compressed_data(rs);
3215          ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3216      }
3217  
3218      if (ret < 0) {
3219          return ret;
3220      }
3221  
3222      ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3223      if (ret < 0) {
3224          return ret;
3225      }
3226  
3227      if (!migrate_multifd_flush_after_each_section()) {
3228          qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3229      }
3230      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3231      qemu_fflush(f);
3232  
3233      return 0;
3234  }
3235  
3236  static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3237                                         uint64_t *can_postcopy)
3238  {
3239      RAMState **temp = opaque;
3240      RAMState *rs = *temp;
3241  
3242      uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3243  
3244      if (migrate_postcopy_ram()) {
3245          /* We can do postcopy, and all the data is postcopiable */
3246          *can_postcopy += remaining_size;
3247      } else {
3248          *must_precopy += remaining_size;
3249      }
3250  }
3251  
3252  static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3253                                      uint64_t *can_postcopy)
3254  {
3255      MigrationState *s = migrate_get_current();
3256      RAMState **temp = opaque;
3257      RAMState *rs = *temp;
3258  
3259      uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3260  
3261      if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3262          qemu_mutex_lock_iothread();
3263          WITH_RCU_READ_LOCK_GUARD() {
3264              migration_bitmap_sync_precopy(rs, false);
3265          }
3266          qemu_mutex_unlock_iothread();
3267          remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3268      }
3269  
3270      if (migrate_postcopy_ram()) {
3271          /* We can do postcopy, and all the data is postcopiable */
3272          *can_postcopy += remaining_size;
3273      } else {
3274          *must_precopy += remaining_size;
3275      }
3276  }
3277  
3278  static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3279  {
3280      unsigned int xh_len;
3281      int xh_flags;
3282      uint8_t *loaded_data;
3283  
3284      /* extract RLE header */
3285      xh_flags = qemu_get_byte(f);
3286      xh_len = qemu_get_be16(f);
3287  
3288      if (xh_flags != ENCODING_FLAG_XBZRLE) {
3289          error_report("Failed to load XBZRLE page - wrong compression!");
3290          return -1;
3291      }
3292  
3293      if (xh_len > TARGET_PAGE_SIZE) {
3294          error_report("Failed to load XBZRLE page - len overflow!");
3295          return -1;
3296      }
3297      loaded_data = XBZRLE.decoded_buf;
3298      /* load data and decode */
3299      /* it can change loaded_data to point to an internal buffer */
3300      qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3301  
3302      /* decode RLE */
3303      if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3304                               TARGET_PAGE_SIZE) == -1) {
3305          error_report("Failed to load XBZRLE page - decode error!");
3306          return -1;
3307      }
3308  
3309      return 0;
3310  }
3311  
3312  /**
3313   * ram_block_from_stream: read a RAMBlock id from the migration stream
3314   *
3315   * Must be called from within a rcu critical section.
3316   *
3317   * Returns a pointer from within the RCU-protected ram_list.
3318   *
3319   * @mis: the migration incoming state pointer
3320   * @f: QEMUFile where to read the data from
3321   * @flags: Page flags (mostly to see if it's a continuation of previous block)
3322   * @channel: the channel we're using
3323   */
3324  static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3325                                                QEMUFile *f, int flags,
3326                                                int channel)
3327  {
3328      RAMBlock *block = mis->last_recv_block[channel];
3329      char id[256];
3330      uint8_t len;
3331  
3332      if (flags & RAM_SAVE_FLAG_CONTINUE) {
3333          if (!block) {
3334              error_report("Ack, bad migration stream!");
3335              return NULL;
3336          }
3337          return block;
3338      }
3339  
3340      len = qemu_get_byte(f);
3341      qemu_get_buffer(f, (uint8_t *)id, len);
3342      id[len] = 0;
3343  
3344      block = qemu_ram_block_by_name(id);
3345      if (!block) {
3346          error_report("Can't find block %s", id);
3347          return NULL;
3348      }
3349  
3350      if (ramblock_is_ignored(block)) {
3351          error_report("block %s should not be migrated !", id);
3352          return NULL;
3353      }
3354  
3355      mis->last_recv_block[channel] = block;
3356  
3357      return block;
3358  }
3359  
3360  static inline void *host_from_ram_block_offset(RAMBlock *block,
3361                                                 ram_addr_t offset)
3362  {
3363      if (!offset_in_ramblock(block, offset)) {
3364          return NULL;
3365      }
3366  
3367      return block->host + offset;
3368  }
3369  
3370  static void *host_page_from_ram_block_offset(RAMBlock *block,
3371                                               ram_addr_t offset)
3372  {
3373      /* Note: Explicitly no check against offset_in_ramblock(). */
3374      return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3375                                     block->page_size);
3376  }
3377  
3378  static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3379                                                           ram_addr_t offset)
3380  {
3381      return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3382  }
3383  
3384  void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3385  {
3386      qemu_mutex_lock(&ram_state->bitmap_mutex);
3387      for (int i = 0; i < pages; i++) {
3388          ram_addr_t offset = normal[i];
3389          ram_state->migration_dirty_pages += !test_and_set_bit(
3390                                                  offset >> TARGET_PAGE_BITS,
3391                                                  block->bmap);
3392      }
3393      qemu_mutex_unlock(&ram_state->bitmap_mutex);
3394  }
3395  
3396  static inline void *colo_cache_from_block_offset(RAMBlock *block,
3397                               ram_addr_t offset, bool record_bitmap)
3398  {
3399      if (!offset_in_ramblock(block, offset)) {
3400          return NULL;
3401      }
3402      if (!block->colo_cache) {
3403          error_report("%s: colo_cache is NULL in block :%s",
3404                       __func__, block->idstr);
3405          return NULL;
3406      }
3407  
3408      /*
3409      * During colo checkpoint, we need bitmap of these migrated pages.
3410      * It help us to decide which pages in ram cache should be flushed
3411      * into VM's RAM later.
3412      */
3413      if (record_bitmap) {
3414          colo_record_bitmap(block, &offset, 1);
3415      }
3416      return block->colo_cache + offset;
3417  }
3418  
3419  /**
3420   * ram_handle_compressed: handle the zero page case
3421   *
3422   * If a page (or a whole RDMA chunk) has been
3423   * determined to be zero, then zap it.
3424   *
3425   * @host: host address for the zero page
3426   * @ch: what the page is filled from.  We only support zero
3427   * @size: size of the zero page
3428   */
3429  void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3430  {
3431      if (ch != 0 || !buffer_is_zero(host, size)) {
3432          memset(host, ch, size);
3433      }
3434  }
3435  
3436  static void colo_init_ram_state(void)
3437  {
3438      ram_state_init(&ram_state);
3439  }
3440  
3441  /*
3442   * colo cache: this is for secondary VM, we cache the whole
3443   * memory of the secondary VM, it is need to hold the global lock
3444   * to call this helper.
3445   */
3446  int colo_init_ram_cache(void)
3447  {
3448      RAMBlock *block;
3449  
3450      WITH_RCU_READ_LOCK_GUARD() {
3451          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3452              block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3453                                                      NULL, false, false);
3454              if (!block->colo_cache) {
3455                  error_report("%s: Can't alloc memory for COLO cache of block %s,"
3456                               "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3457                               block->used_length);
3458                  RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3459                      if (block->colo_cache) {
3460                          qemu_anon_ram_free(block->colo_cache, block->used_length);
3461                          block->colo_cache = NULL;
3462                      }
3463                  }
3464                  return -errno;
3465              }
3466              if (!machine_dump_guest_core(current_machine)) {
3467                  qemu_madvise(block->colo_cache, block->used_length,
3468                               QEMU_MADV_DONTDUMP);
3469              }
3470          }
3471      }
3472  
3473      /*
3474      * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3475      * with to decide which page in cache should be flushed into SVM's RAM. Here
3476      * we use the same name 'ram_bitmap' as for migration.
3477      */
3478      if (ram_bytes_total()) {
3479          RAMBlock *block;
3480  
3481          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3482              unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3483              block->bmap = bitmap_new(pages);
3484          }
3485      }
3486  
3487      colo_init_ram_state();
3488      return 0;
3489  }
3490  
3491  /* TODO: duplicated with ram_init_bitmaps */
3492  void colo_incoming_start_dirty_log(void)
3493  {
3494      RAMBlock *block = NULL;
3495      /* For memory_global_dirty_log_start below. */
3496      qemu_mutex_lock_iothread();
3497      qemu_mutex_lock_ramlist();
3498  
3499      memory_global_dirty_log_sync(false);
3500      WITH_RCU_READ_LOCK_GUARD() {
3501          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3502              ramblock_sync_dirty_bitmap(ram_state, block);
3503              /* Discard this dirty bitmap record */
3504              bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3505          }
3506          memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3507      }
3508      ram_state->migration_dirty_pages = 0;
3509      qemu_mutex_unlock_ramlist();
3510      qemu_mutex_unlock_iothread();
3511  }
3512  
3513  /* It is need to hold the global lock to call this helper */
3514  void colo_release_ram_cache(void)
3515  {
3516      RAMBlock *block;
3517  
3518      memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3519      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3520          g_free(block->bmap);
3521          block->bmap = NULL;
3522      }
3523  
3524      WITH_RCU_READ_LOCK_GUARD() {
3525          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3526              if (block->colo_cache) {
3527                  qemu_anon_ram_free(block->colo_cache, block->used_length);
3528                  block->colo_cache = NULL;
3529              }
3530          }
3531      }
3532      ram_state_cleanup(&ram_state);
3533  }
3534  
3535  /**
3536   * ram_load_setup: Setup RAM for migration incoming side
3537   *
3538   * Returns zero to indicate success and negative for error
3539   *
3540   * @f: QEMUFile where to receive the data
3541   * @opaque: RAMState pointer
3542   */
3543  static int ram_load_setup(QEMUFile *f, void *opaque)
3544  {
3545      xbzrle_load_setup();
3546      ramblock_recv_map_init();
3547  
3548      return 0;
3549  }
3550  
3551  static int ram_load_cleanup(void *opaque)
3552  {
3553      RAMBlock *rb;
3554  
3555      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3556          qemu_ram_block_writeback(rb);
3557      }
3558  
3559      xbzrle_load_cleanup();
3560  
3561      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3562          g_free(rb->receivedmap);
3563          rb->receivedmap = NULL;
3564      }
3565  
3566      return 0;
3567  }
3568  
3569  /**
3570   * ram_postcopy_incoming_init: allocate postcopy data structures
3571   *
3572   * Returns 0 for success and negative if there was one error
3573   *
3574   * @mis: current migration incoming state
3575   *
3576   * Allocate data structures etc needed by incoming migration with
3577   * postcopy-ram. postcopy-ram's similarly names
3578   * postcopy_ram_incoming_init does the work.
3579   */
3580  int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3581  {
3582      return postcopy_ram_incoming_init(mis);
3583  }
3584  
3585  /**
3586   * ram_load_postcopy: load a page in postcopy case
3587   *
3588   * Returns 0 for success or -errno in case of error
3589   *
3590   * Called in postcopy mode by ram_load().
3591   * rcu_read_lock is taken prior to this being called.
3592   *
3593   * @f: QEMUFile where to send the data
3594   * @channel: the channel to use for loading
3595   */
3596  int ram_load_postcopy(QEMUFile *f, int channel)
3597  {
3598      int flags = 0, ret = 0;
3599      bool place_needed = false;
3600      bool matches_target_page_size = false;
3601      MigrationIncomingState *mis = migration_incoming_get_current();
3602      PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3603  
3604      while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3605          ram_addr_t addr;
3606          void *page_buffer = NULL;
3607          void *place_source = NULL;
3608          RAMBlock *block = NULL;
3609          uint8_t ch;
3610          int len;
3611  
3612          addr = qemu_get_be64(f);
3613  
3614          /*
3615           * If qemu file error, we should stop here, and then "addr"
3616           * may be invalid
3617           */
3618          ret = qemu_file_get_error(f);
3619          if (ret) {
3620              break;
3621          }
3622  
3623          flags = addr & ~TARGET_PAGE_MASK;
3624          addr &= TARGET_PAGE_MASK;
3625  
3626          trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3627          if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3628                       RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3629              block = ram_block_from_stream(mis, f, flags, channel);
3630              if (!block) {
3631                  ret = -EINVAL;
3632                  break;
3633              }
3634  
3635              /*
3636               * Relying on used_length is racy and can result in false positives.
3637               * We might place pages beyond used_length in case RAM was shrunk
3638               * while in postcopy, which is fine - trying to place via
3639               * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3640               */
3641              if (!block->host || addr >= block->postcopy_length) {
3642                  error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3643                  ret = -EINVAL;
3644                  break;
3645              }
3646              tmp_page->target_pages++;
3647              matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3648              /*
3649               * Postcopy requires that we place whole host pages atomically;
3650               * these may be huge pages for RAMBlocks that are backed by
3651               * hugetlbfs.
3652               * To make it atomic, the data is read into a temporary page
3653               * that's moved into place later.
3654               * The migration protocol uses,  possibly smaller, target-pages
3655               * however the source ensures it always sends all the components
3656               * of a host page in one chunk.
3657               */
3658              page_buffer = tmp_page->tmp_huge_page +
3659                            host_page_offset_from_ram_block_offset(block, addr);
3660              /* If all TP are zero then we can optimise the place */
3661              if (tmp_page->target_pages == 1) {
3662                  tmp_page->host_addr =
3663                      host_page_from_ram_block_offset(block, addr);
3664              } else if (tmp_page->host_addr !=
3665                         host_page_from_ram_block_offset(block, addr)) {
3666                  /* not the 1st TP within the HP */
3667                  error_report("Non-same host page detected on channel %d: "
3668                               "Target host page %p, received host page %p "
3669                               "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3670                               channel, tmp_page->host_addr,
3671                               host_page_from_ram_block_offset(block, addr),
3672                               block->idstr, addr, tmp_page->target_pages);
3673                  ret = -EINVAL;
3674                  break;
3675              }
3676  
3677              /*
3678               * If it's the last part of a host page then we place the host
3679               * page
3680               */
3681              if (tmp_page->target_pages ==
3682                  (block->page_size / TARGET_PAGE_SIZE)) {
3683                  place_needed = true;
3684              }
3685              place_source = tmp_page->tmp_huge_page;
3686          }
3687  
3688          switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3689          case RAM_SAVE_FLAG_ZERO:
3690              ch = qemu_get_byte(f);
3691              /*
3692               * Can skip to set page_buffer when
3693               * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3694               */
3695              if (ch || !matches_target_page_size) {
3696                  memset(page_buffer, ch, TARGET_PAGE_SIZE);
3697              }
3698              if (ch) {
3699                  tmp_page->all_zero = false;
3700              }
3701              break;
3702  
3703          case RAM_SAVE_FLAG_PAGE:
3704              tmp_page->all_zero = false;
3705              if (!matches_target_page_size) {
3706                  /* For huge pages, we always use temporary buffer */
3707                  qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3708              } else {
3709                  /*
3710                   * For small pages that matches target page size, we
3711                   * avoid the qemu_file copy.  Instead we directly use
3712                   * the buffer of QEMUFile to place the page.  Note: we
3713                   * cannot do any QEMUFile operation before using that
3714                   * buffer to make sure the buffer is valid when
3715                   * placing the page.
3716                   */
3717                  qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3718                                           TARGET_PAGE_SIZE);
3719              }
3720              break;
3721          case RAM_SAVE_FLAG_COMPRESS_PAGE:
3722              tmp_page->all_zero = false;
3723              len = qemu_get_be32(f);
3724              if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3725                  error_report("Invalid compressed data length: %d", len);
3726                  ret = -EINVAL;
3727                  break;
3728              }
3729              decompress_data_with_multi_threads(f, page_buffer, len);
3730              break;
3731          case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3732              multifd_recv_sync_main();
3733              break;
3734          case RAM_SAVE_FLAG_EOS:
3735              /* normal exit */
3736              if (migrate_multifd_flush_after_each_section()) {
3737                  multifd_recv_sync_main();
3738              }
3739              break;
3740          default:
3741              error_report("Unknown combination of migration flags: 0x%x"
3742                           " (postcopy mode)", flags);
3743              ret = -EINVAL;
3744              break;
3745          }
3746  
3747          /* Got the whole host page, wait for decompress before placing. */
3748          if (place_needed) {
3749              ret |= wait_for_decompress_done();
3750          }
3751  
3752          /* Detect for any possible file errors */
3753          if (!ret && qemu_file_get_error(f)) {
3754              ret = qemu_file_get_error(f);
3755          }
3756  
3757          if (!ret && place_needed) {
3758              if (tmp_page->all_zero) {
3759                  ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3760              } else {
3761                  ret = postcopy_place_page(mis, tmp_page->host_addr,
3762                                            place_source, block);
3763              }
3764              place_needed = false;
3765              postcopy_temp_page_reset(tmp_page);
3766          }
3767      }
3768  
3769      return ret;
3770  }
3771  
3772  static bool postcopy_is_running(void)
3773  {
3774      PostcopyState ps = postcopy_state_get();
3775      return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3776  }
3777  
3778  /*
3779   * Flush content of RAM cache into SVM's memory.
3780   * Only flush the pages that be dirtied by PVM or SVM or both.
3781   */
3782  void colo_flush_ram_cache(void)
3783  {
3784      RAMBlock *block = NULL;
3785      void *dst_host;
3786      void *src_host;
3787      unsigned long offset = 0;
3788  
3789      memory_global_dirty_log_sync(false);
3790      qemu_mutex_lock(&ram_state->bitmap_mutex);
3791      WITH_RCU_READ_LOCK_GUARD() {
3792          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3793              ramblock_sync_dirty_bitmap(ram_state, block);
3794          }
3795      }
3796  
3797      trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3798      WITH_RCU_READ_LOCK_GUARD() {
3799          block = QLIST_FIRST_RCU(&ram_list.blocks);
3800  
3801          while (block) {
3802              unsigned long num = 0;
3803  
3804              offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3805              if (!offset_in_ramblock(block,
3806                                      ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3807                  offset = 0;
3808                  num = 0;
3809                  block = QLIST_NEXT_RCU(block, next);
3810              } else {
3811                  unsigned long i = 0;
3812  
3813                  for (i = 0; i < num; i++) {
3814                      migration_bitmap_clear_dirty(ram_state, block, offset + i);
3815                  }
3816                  dst_host = block->host
3817                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3818                  src_host = block->colo_cache
3819                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3820                  memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3821                  offset += num;
3822              }
3823          }
3824      }
3825      qemu_mutex_unlock(&ram_state->bitmap_mutex);
3826      trace_colo_flush_ram_cache_end();
3827  }
3828  
3829  /**
3830   * ram_load_precopy: load pages in precopy case
3831   *
3832   * Returns 0 for success or -errno in case of error
3833   *
3834   * Called in precopy mode by ram_load().
3835   * rcu_read_lock is taken prior to this being called.
3836   *
3837   * @f: QEMUFile where to send the data
3838   */
3839  static int ram_load_precopy(QEMUFile *f)
3840  {
3841      MigrationIncomingState *mis = migration_incoming_get_current();
3842      int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3843      /* ADVISE is earlier, it shows the source has the postcopy capability on */
3844      bool postcopy_advised = migration_incoming_postcopy_advised();
3845      if (!migrate_compress()) {
3846          invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3847      }
3848  
3849      while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3850          ram_addr_t addr, total_ram_bytes;
3851          void *host = NULL, *host_bak = NULL;
3852          uint8_t ch;
3853  
3854          /*
3855           * Yield periodically to let main loop run, but an iteration of
3856           * the main loop is expensive, so do it each some iterations
3857           */
3858          if ((i & 32767) == 0 && qemu_in_coroutine()) {
3859              aio_co_schedule(qemu_get_current_aio_context(),
3860                              qemu_coroutine_self());
3861              qemu_coroutine_yield();
3862          }
3863          i++;
3864  
3865          addr = qemu_get_be64(f);
3866          flags = addr & ~TARGET_PAGE_MASK;
3867          addr &= TARGET_PAGE_MASK;
3868  
3869          if (flags & invalid_flags) {
3870              if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3871                  error_report("Received an unexpected compressed page");
3872              }
3873  
3874              ret = -EINVAL;
3875              break;
3876          }
3877  
3878          if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3879                       RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3880              RAMBlock *block = ram_block_from_stream(mis, f, flags,
3881                                                      RAM_CHANNEL_PRECOPY);
3882  
3883              host = host_from_ram_block_offset(block, addr);
3884              /*
3885               * After going into COLO stage, we should not load the page
3886               * into SVM's memory directly, we put them into colo_cache firstly.
3887               * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3888               * Previously, we copied all these memory in preparing stage of COLO
3889               * while we need to stop VM, which is a time-consuming process.
3890               * Here we optimize it by a trick, back-up every page while in
3891               * migration process while COLO is enabled, though it affects the
3892               * speed of the migration, but it obviously reduce the downtime of
3893               * back-up all SVM'S memory in COLO preparing stage.
3894               */
3895              if (migration_incoming_colo_enabled()) {
3896                  if (migration_incoming_in_colo_state()) {
3897                      /* In COLO stage, put all pages into cache temporarily */
3898                      host = colo_cache_from_block_offset(block, addr, true);
3899                  } else {
3900                     /*
3901                      * In migration stage but before COLO stage,
3902                      * Put all pages into both cache and SVM's memory.
3903                      */
3904                      host_bak = colo_cache_from_block_offset(block, addr, false);
3905                  }
3906              }
3907              if (!host) {
3908                  error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3909                  ret = -EINVAL;
3910                  break;
3911              }
3912              if (!migration_incoming_in_colo_state()) {
3913                  ramblock_recv_bitmap_set(block, host);
3914              }
3915  
3916              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3917          }
3918  
3919          switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3920          case RAM_SAVE_FLAG_MEM_SIZE:
3921              /* Synchronize RAM block list */
3922              total_ram_bytes = addr;
3923              while (!ret && total_ram_bytes) {
3924                  RAMBlock *block;
3925                  char id[256];
3926                  ram_addr_t length;
3927  
3928                  len = qemu_get_byte(f);
3929                  qemu_get_buffer(f, (uint8_t *)id, len);
3930                  id[len] = 0;
3931                  length = qemu_get_be64(f);
3932  
3933                  block = qemu_ram_block_by_name(id);
3934                  if (block && !qemu_ram_is_migratable(block)) {
3935                      error_report("block %s should not be migrated !", id);
3936                      ret = -EINVAL;
3937                  } else if (block) {
3938                      if (length != block->used_length) {
3939                          Error *local_err = NULL;
3940  
3941                          ret = qemu_ram_resize(block, length,
3942                                                &local_err);
3943                          if (local_err) {
3944                              error_report_err(local_err);
3945                          }
3946                      }
3947                      /* For postcopy we need to check hugepage sizes match */
3948                      if (postcopy_advised && migrate_postcopy_ram() &&
3949                          block->page_size != qemu_host_page_size) {
3950                          uint64_t remote_page_size = qemu_get_be64(f);
3951                          if (remote_page_size != block->page_size) {
3952                              error_report("Mismatched RAM page size %s "
3953                                           "(local) %zd != %" PRId64,
3954                                           id, block->page_size,
3955                                           remote_page_size);
3956                              ret = -EINVAL;
3957                          }
3958                      }
3959                      if (migrate_ignore_shared()) {
3960                          hwaddr addr = qemu_get_be64(f);
3961                          if (ramblock_is_ignored(block) &&
3962                              block->mr->addr != addr) {
3963                              error_report("Mismatched GPAs for block %s "
3964                                           "%" PRId64 "!= %" PRId64,
3965                                           id, (uint64_t)addr,
3966                                           (uint64_t)block->mr->addr);
3967                              ret = -EINVAL;
3968                          }
3969                      }
3970                      ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3971                                            block->idstr);
3972                  } else {
3973                      error_report("Unknown ramblock \"%s\", cannot "
3974                                   "accept migration", id);
3975                      ret = -EINVAL;
3976                  }
3977  
3978                  total_ram_bytes -= length;
3979              }
3980              break;
3981  
3982          case RAM_SAVE_FLAG_ZERO:
3983              ch = qemu_get_byte(f);
3984              ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3985              break;
3986  
3987          case RAM_SAVE_FLAG_PAGE:
3988              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3989              break;
3990  
3991          case RAM_SAVE_FLAG_COMPRESS_PAGE:
3992              len = qemu_get_be32(f);
3993              if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3994                  error_report("Invalid compressed data length: %d", len);
3995                  ret = -EINVAL;
3996                  break;
3997              }
3998              decompress_data_with_multi_threads(f, host, len);
3999              break;
4000  
4001          case RAM_SAVE_FLAG_XBZRLE:
4002              if (load_xbzrle(f, addr, host) < 0) {
4003                  error_report("Failed to decompress XBZRLE page at "
4004                               RAM_ADDR_FMT, addr);
4005                  ret = -EINVAL;
4006                  break;
4007              }
4008              break;
4009          case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4010              multifd_recv_sync_main();
4011              break;
4012          case RAM_SAVE_FLAG_EOS:
4013              /* normal exit */
4014              if (migrate_multifd_flush_after_each_section()) {
4015                  multifd_recv_sync_main();
4016              }
4017              break;
4018          case RAM_SAVE_FLAG_HOOK:
4019              ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4020              break;
4021          default:
4022              error_report("Unknown combination of migration flags: 0x%x", flags);
4023              ret = -EINVAL;
4024          }
4025          if (!ret) {
4026              ret = qemu_file_get_error(f);
4027          }
4028          if (!ret && host_bak) {
4029              memcpy(host_bak, host, TARGET_PAGE_SIZE);
4030          }
4031      }
4032  
4033      ret |= wait_for_decompress_done();
4034      return ret;
4035  }
4036  
4037  static int ram_load(QEMUFile *f, void *opaque, int version_id)
4038  {
4039      int ret = 0;
4040      static uint64_t seq_iter;
4041      /*
4042       * If system is running in postcopy mode, page inserts to host memory must
4043       * be atomic
4044       */
4045      bool postcopy_running = postcopy_is_running();
4046  
4047      seq_iter++;
4048  
4049      if (version_id != 4) {
4050          return -EINVAL;
4051      }
4052  
4053      /*
4054       * This RCU critical section can be very long running.
4055       * When RCU reclaims in the code start to become numerous,
4056       * it will be necessary to reduce the granularity of this
4057       * critical section.
4058       */
4059      WITH_RCU_READ_LOCK_GUARD() {
4060          if (postcopy_running) {
4061              /*
4062               * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4063               * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4064               * service fast page faults.
4065               */
4066              ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4067          } else {
4068              ret = ram_load_precopy(f);
4069          }
4070      }
4071      trace_ram_load_complete(ret, seq_iter);
4072  
4073      return ret;
4074  }
4075  
4076  static bool ram_has_postcopy(void *opaque)
4077  {
4078      RAMBlock *rb;
4079      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4080          if (ramblock_is_pmem(rb)) {
4081              info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4082                           "is not supported now!", rb->idstr, rb->host);
4083              return false;
4084          }
4085      }
4086  
4087      return migrate_postcopy_ram();
4088  }
4089  
4090  /* Sync all the dirty bitmap with destination VM.  */
4091  static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4092  {
4093      RAMBlock *block;
4094      QEMUFile *file = s->to_dst_file;
4095      int ramblock_count = 0;
4096  
4097      trace_ram_dirty_bitmap_sync_start();
4098  
4099      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4100          qemu_savevm_send_recv_bitmap(file, block->idstr);
4101          trace_ram_dirty_bitmap_request(block->idstr);
4102          ramblock_count++;
4103      }
4104  
4105      trace_ram_dirty_bitmap_sync_wait();
4106  
4107      /* Wait until all the ramblocks' dirty bitmap synced */
4108      while (ramblock_count--) {
4109          qemu_sem_wait(&s->rp_state.rp_sem);
4110      }
4111  
4112      trace_ram_dirty_bitmap_sync_complete();
4113  
4114      return 0;
4115  }
4116  
4117  static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4118  {
4119      qemu_sem_post(&s->rp_state.rp_sem);
4120  }
4121  
4122  /*
4123   * Read the received bitmap, revert it as the initial dirty bitmap.
4124   * This is only used when the postcopy migration is paused but wants
4125   * to resume from a middle point.
4126   */
4127  int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4128  {
4129      int ret = -EINVAL;
4130      /* from_dst_file is always valid because we're within rp_thread */
4131      QEMUFile *file = s->rp_state.from_dst_file;
4132      unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4133      uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4134      uint64_t size, end_mark;
4135  
4136      trace_ram_dirty_bitmap_reload_begin(block->idstr);
4137  
4138      if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4139          error_report("%s: incorrect state %s", __func__,
4140                       MigrationStatus_str(s->state));
4141          return -EINVAL;
4142      }
4143  
4144      /*
4145       * Note: see comments in ramblock_recv_bitmap_send() on why we
4146       * need the endianness conversion, and the paddings.
4147       */
4148      local_size = ROUND_UP(local_size, 8);
4149  
4150      /* Add paddings */
4151      le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4152  
4153      size = qemu_get_be64(file);
4154  
4155      /* The size of the bitmap should match with our ramblock */
4156      if (size != local_size) {
4157          error_report("%s: ramblock '%s' bitmap size mismatch "
4158                       "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4159                       block->idstr, size, local_size);
4160          ret = -EINVAL;
4161          goto out;
4162      }
4163  
4164      size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4165      end_mark = qemu_get_be64(file);
4166  
4167      ret = qemu_file_get_error(file);
4168      if (ret || size != local_size) {
4169          error_report("%s: read bitmap failed for ramblock '%s': %d"
4170                       " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4171                       __func__, block->idstr, ret, local_size, size);
4172          ret = -EIO;
4173          goto out;
4174      }
4175  
4176      if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4177          error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4178                       __func__, block->idstr, end_mark);
4179          ret = -EINVAL;
4180          goto out;
4181      }
4182  
4183      /*
4184       * Endianness conversion. We are during postcopy (though paused).
4185       * The dirty bitmap won't change. We can directly modify it.
4186       */
4187      bitmap_from_le(block->bmap, le_bitmap, nbits);
4188  
4189      /*
4190       * What we received is "received bitmap". Revert it as the initial
4191       * dirty bitmap for this ramblock.
4192       */
4193      bitmap_complement(block->bmap, block->bmap, nbits);
4194  
4195      /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4196      ramblock_dirty_bitmap_clear_discarded_pages(block);
4197  
4198      /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4199      trace_ram_dirty_bitmap_reload_complete(block->idstr);
4200  
4201      /*
4202       * We succeeded to sync bitmap for current ramblock. If this is
4203       * the last one to sync, we need to notify the main send thread.
4204       */
4205      ram_dirty_bitmap_reload_notify(s);
4206  
4207      ret = 0;
4208  out:
4209      g_free(le_bitmap);
4210      return ret;
4211  }
4212  
4213  static int ram_resume_prepare(MigrationState *s, void *opaque)
4214  {
4215      RAMState *rs = *(RAMState **)opaque;
4216      int ret;
4217  
4218      ret = ram_dirty_bitmap_sync_all(s, rs);
4219      if (ret) {
4220          return ret;
4221      }
4222  
4223      ram_state_resume_prepare(rs, s->to_dst_file);
4224  
4225      return 0;
4226  }
4227  
4228  void postcopy_preempt_shutdown_file(MigrationState *s)
4229  {
4230      qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4231      qemu_fflush(s->postcopy_qemufile_src);
4232  }
4233  
4234  static SaveVMHandlers savevm_ram_handlers = {
4235      .save_setup = ram_save_setup,
4236      .save_live_iterate = ram_save_iterate,
4237      .save_live_complete_postcopy = ram_save_complete,
4238      .save_live_complete_precopy = ram_save_complete,
4239      .has_postcopy = ram_has_postcopy,
4240      .state_pending_exact = ram_state_pending_exact,
4241      .state_pending_estimate = ram_state_pending_estimate,
4242      .load_state = ram_load,
4243      .save_cleanup = ram_save_cleanup,
4244      .load_setup = ram_load_setup,
4245      .load_cleanup = ram_load_cleanup,
4246      .resume_prepare = ram_resume_prepare,
4247  };
4248  
4249  static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4250                                        size_t old_size, size_t new_size)
4251  {
4252      PostcopyState ps = postcopy_state_get();
4253      ram_addr_t offset;
4254      RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4255      Error *err = NULL;
4256  
4257      if (ramblock_is_ignored(rb)) {
4258          return;
4259      }
4260  
4261      if (!migration_is_idle()) {
4262          /*
4263           * Precopy code on the source cannot deal with the size of RAM blocks
4264           * changing at random points in time - especially after sending the
4265           * RAM block sizes in the migration stream, they must no longer change.
4266           * Abort and indicate a proper reason.
4267           */
4268          error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4269          migration_cancel(err);
4270          error_free(err);
4271      }
4272  
4273      switch (ps) {
4274      case POSTCOPY_INCOMING_ADVISE:
4275          /*
4276           * Update what ram_postcopy_incoming_init()->init_range() does at the
4277           * time postcopy was advised. Syncing RAM blocks with the source will
4278           * result in RAM resizes.
4279           */
4280          if (old_size < new_size) {
4281              if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4282                  error_report("RAM block '%s' discard of resized RAM failed",
4283                               rb->idstr);
4284              }
4285          }
4286          rb->postcopy_length = new_size;
4287          break;
4288      case POSTCOPY_INCOMING_NONE:
4289      case POSTCOPY_INCOMING_RUNNING:
4290      case POSTCOPY_INCOMING_END:
4291          /*
4292           * Once our guest is running, postcopy does no longer care about
4293           * resizes. When growing, the new memory was not available on the
4294           * source, no handler needed.
4295           */
4296          break;
4297      default:
4298          error_report("RAM block '%s' resized during postcopy state: %d",
4299                       rb->idstr, ps);
4300          exit(-1);
4301      }
4302  }
4303  
4304  static RAMBlockNotifier ram_mig_ram_notifier = {
4305      .ram_block_resized = ram_mig_ram_block_resized,
4306  };
4307  
4308  void ram_mig_init(void)
4309  {
4310      qemu_mutex_init(&XBZRLE.lock);
4311      register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4312      ram_block_notifier_add(&ram_mig_ram_notifier);
4313  }
4314