/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "system/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "system/cpu-throttle.h"
#include "system/physmem.h"
#include "system/ramblock.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "system/runstate.h"
#include "rdma.h"
#include "options.h"
#include "system/dirtylimit.h"
#include "system/kvm.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * mapped-ram migration supports O_DIRECT, so we need to make sure the
 * userspace buffer, the IO operation size and the file offset are
 * aligned according to the underlying device's block size. The first
 * two are already aligned to page size, but we need to add padding to
 * the file to align the offset. We cannot read the block size
 * dynamically because the migration file can be moved between
 * different systems, so use 1M to cover most block sizes and to keep
 * the file offset aligned at page size as well.
 */
#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000
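
/*
 * Illustrative example (not part of the original code): if the metadata
 * written before the pages region ends at some arbitrary file offset,
 * the pages region is padded up to the next multiple of the alignment,
 * e.g. ROUND_UP(0x1234, MAPPED_RAM_FILE_OFFSET_ALIGNMENT) == 0x100000,
 * so that O_DIRECT I/O on whole pages stays block-size aligned on any
 * reasonable host device.
 */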
/*
 * When doing mapped-ram migration, this is the amount we read from
 * the pages region in the migration file at a time.
 */
#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000

XBZRLECacheStats xbzrle_counters;

/*
 * This structure locates a specific location of a guest page. In QEMU,
 * it's described in a tuple of (ramblock, offset).
 */
struct PageLocation {
    RAMBlock *block;
    unsigned long offset;
};
typedef struct PageLocation PageLocation;

/**
 * PageLocationHint: describes a hint to a page location
 *
 * @valid: set if the hint is valid and to be consumed
 * @location: the hint content
 *
 * In postcopy preempt mode, the urgent channel may provide hints to the
 * background channel, so that QEMU source can try to migrate whatever is
 * right after the requested urgent pages.
 *
 * This is based on the assumption that the VM (already running on the
 * destination side) tends to access the memory with spatial locality.
 * This is also the default behavior of vanilla postcopy (preempt off).
 */
struct PageLocationHint {
    bool valid;
    PageLocation location;
};
typedef struct PageLocationHint PageLocationHint;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page. Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
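/*
 * Illustrative note (not from the original source): callers are expected
 * to bracket any access to XBZRLE.cache with the pair above, e.g.
 *
 *     XBZRLE_cache_lock();
 *     ... cache_insert() / get_cached_data() ...
 *     XBZRLE_cache_unlock();
 *
 * The mutex is only taken when XBZRLE is enabled, so the pair is cheap
 * for non-XBZRLE migrations.
 */
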
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool migrate_ram_is_ignored(RAMBlock *block)
{
    MigMode mode = migrate_mode();
    return !qemu_ram_is_migratable(block) ||
           mode == MIG_MODE_CPR_TRANSFER ||
           mode == MIG_MODE_CPR_EXEC ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block)
                                    && qemu_ram_is_named_file(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset)
{
    set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;
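
    /*
     * Illustrative layout (not from the original source): for a 1 GiB
     * block with 4 KiB target pages, nbits == 262144, so the stream
     * written below looks like
     *
     *   [ be64 size = ROUND_UP(262144 / 8, 8) = 32768 ]
     *   [ 32768 bytes of little-endian bitmap         ]
     *   [ be64 RAMBLOCK_RECV_BITMAP_ENDING marker     ]
     */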
    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that it works even when source and destination VMs
     * are not using the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /*
     * This is only used when postcopy is in recovery phase, to communicate
     * between the migration thread and the return path thread on dirty
     * bitmap synchronizations. This field is unused in other stages of
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
    /*
     * Page hint during postcopy when preempt mode is on. Return path
     * thread sets it, while background migration thread consumes it.
     *
     * Protected by @bitmap_mutex.
     */
    PageLocationHint page_hint;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
}

static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page. Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
        (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = migration_transferred_bytes();
}
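
/*
 * Worked example for mig_throttle_guest_down()'s tailslow path above
 * (illustrative, not from the original source): with throttle_now = 40,
 * cpu_now = 60 and a dirty threshold/period ratio of 0.5, cpu_ideal is
 * 60 * 0.5 = 30, so the increment is MIN(60 - 30, pct_increment).  The
 * throttle therefore ramps towards the point where the dirty rate would
 * match the transfer rate, instead of always jumping by pct_increment.
 */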
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * The xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found. Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty
 * pages outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
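
    /*
     * Illustrative example (not from the original source): with a
     * clear_bmap_shift of 18 and 4 KiB target pages, chunk_pages is
     * 1 << 18 == 262144 pages, i.e. a 1 GiB chunk, so [start, start +
     * npages) is widened to whole chunk boundaries before the per-chunk
     * clearing below.
     */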
    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * During the last stage (after source VM stopped), resetting the write
     * protections isn't needed as we know there will be either (1) no
     * further writes if migration will complete, or (2) migration fails
     * at last then tracking isn't needed either.
     *
     * Do the same for postcopy due to the same reason.
     */
    if (!rs->last_stage && !migration_in_postcopy()) {
        /*
         * Clear dirty bitmap if needed. This _must_ be called before we
         * send any of the page in the chunk because we need to make sure
         * we can capture further page content changes when we sync dirty
         * log the next time. So as long as we are going to send any of
         * the page in the chunk we clear the remote dirty bitmap for all.
         * Clearing it earlier won't be a problem, but too late will.
         */
        migration_clear_memory_region_dirty_bitmap(rb, page);
    }

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static int dirty_bitmap_clear_section(MemoryRegionSection *section,
                                      void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
    return 0;
}
/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb,
                                                  ram_addr_t start,
                                                  ram_addr_t length)
{
    ram_addr_t addr;
    unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
    uint64_t num_dirty = 0;
    unsigned long *dest = rb->bmap;

    /* start address and length are aligned at the start of a word? */
    if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) ==
        (start + rb->offset) &&
        !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
        int k;
        int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
        unsigned long * const *src;
        unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = BIT_WORD((word * BITS_PER_LONG) %
                                        DIRTY_MEMORY_BLOCK_SIZE);
        unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);

        src = qatomic_rcu_read(
                &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;

        for (k = page; k < page + nr; k++) {
            if (src[idx][offset]) {
                unsigned long bits = qatomic_xchg(&src[idx][offset], 0);
                unsigned long new_dirty;
                new_dirty = ~dest[k];
                dest[k] |= bits;
                new_dirty &= bits;
                num_dirty += ctpopl(new_dirty);
            }

            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
                offset = 0;
                idx++;
            }
        }
        if (num_dirty) {
            physical_memory_dirty_bits_cleared(start, length);
        }

        if (rb->clear_bmap) {
            /*
             * Postpone the dirty bitmap clear to the point before we
             * really send the pages, also we will split the clear
             * dirty procedure into smaller chunks.
             */
            clear_bmap_set(rb, start >> TARGET_PAGE_BITS,
                           length >> TARGET_PAGE_BITS);
        } else {
            /* Slow path - still do that in a huge chunk */
            memory_region_clear_dirty_bitmap(rb->mr, start, length);
        }
    } else {
        ram_addr_t offset = rb->offset;

        for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
            if (physical_memory_test_and_clear_dirty(
                        start + addr + offset,
                        TARGET_PAGE_SIZE,
                        DIRTY_MEMORY_MIGRATION)) {
                long k = (start + addr) >> TARGET_PAGE_BITS;
                if (!test_and_set_bit(k, dest)) {
                    num_dirty++;
                }
            }
        }
    }

    return num_dirty;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
        stat64_get(&mig_stats.zero_pages) +
        xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }
}

/*
 * Enable dirty-limit to throttle down the guest
 */
static void migration_dirty_limit_guest(void)
{
    /*
     * dirty page rate quota for all vCPUs fetched from
     * migration parameter 'vcpu_dirty_limit'
     */
    static int64_t quota_dirtyrate;
    MigrationState *s = migrate_get_current();

    /*
     * If dirty limit already enabled and migration parameter
     * vcpu-dirty-limit untouched.
     */
    if (dirtylimit_in_service() &&
        quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
        return;
    }

    quota_dirtyrate = s->parameters.vcpu_dirty_limit;

    /*
     * Set all vCPU a quota dirtyrate, note that the second
     * parameter will be ignored if setting all vCPU for the vm
     */
    qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
    trace_migration_dirty_limit_guest(quota_dirtyrate);
}

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * The following detection logic can be refined later. For now:
     * Check to see if the ratio between dirtied bytes and the approx.
     * amount of bytes that just got transferred since the last time
     * we were in this routine reaches the threshold. If that happens
     * twice, start or increase throttling.
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
        WITH_RCU_READ_LOCK_GUARD() {
            RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                ramblock_sync_dirty_bitmap(rs, block);
            }
            stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
        }
    }

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

void migration_bitmap_sync_precopy(bool last_stage)
{
    Error *local_err = NULL;
    assert(ram_state);

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(ram_state, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}
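
/*
 * Illustrative note on save_normal_page() above (not from the original
 * source): with mapped-ram the page is written at a fixed file position,
 * block->pages_offset + offset, and the matching bit in block->file_bmap
 * records that the page is present, so no per-page stream header is
 * emitted and pages in the file can be accessed out of order.
 */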
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
{
    if (!multifd_queue_page(block, offset)) {
        return -1;
    }

    return 1;
}


#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (multifd_ram_sync_per_round()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_ram_flush_and_sync(f);
                if (ret < 0) {
                    return ret;
                }
            }

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically. We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    g_assert_not_reached();
}

int ram_write_tracking_start(void)
{
    g_assert_not_reached();
}

void ram_write_tracking_stop(void)
{
    g_assert_not_reached();
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty = false;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
1873 */ 1874 if (block) { 1875 unsigned long page; 1876 1877 page = offset >> TARGET_PAGE_BITS; 1878 dirty = test_bit(page, block->bmap); 1879 if (!dirty) { 1880 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1881 page); 1882 } else { 1883 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1884 } 1885 } 1886 1887 } while (block && !dirty); 1888 1889 if (!block) { 1890 /* 1891 * Poll write faults too if background snapshot is enabled; that's 1892 * when we have vcpus got blocked by the write protected pages. 1893 */ 1894 block = poll_fault_page(rs, &offset); 1895 } 1896 1897 if (block) { 1898 /* 1899 * We want the background search to continue from the queued page 1900 * since the guest is likely to want other pages near to the page 1901 * it just requested. 1902 */ 1903 pss->block = block; 1904 pss->page = offset >> TARGET_PAGE_BITS; 1905 1906 /* 1907 * This unqueued page would break the "one round" check, even is 1908 * really rare. 1909 */ 1910 pss->complete_round = false; 1911 } 1912 1913 return !!block; 1914 } 1915 1916 /** 1917 * migration_page_queue_free: drop any remaining pages in the ram 1918 * request queue 1919 * 1920 * It should be empty at the end anyway, but in error cases there may 1921 * be some left. in case that there is any page left, we drop it. 1922 * 1923 */ 1924 static void migration_page_queue_free(RAMState *rs) 1925 { 1926 struct RAMSrcPageRequest *mspr, *next_mspr; 1927 /* This queue generally should be empty - but in the case of a failed 1928 * migration might have some droppings in. 1929 */ 1930 RCU_READ_LOCK_GUARD(); 1931 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1932 memory_region_unref(mspr->rb->mr); 1933 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1934 g_free(mspr); 1935 } 1936 } 1937 1938 /** 1939 * ram_save_queue_pages: queue the page for transmission 1940 * 1941 * A request from postcopy destination for example. 1942 * 1943 * Returns zero on success or negative on error 1944 * 1945 * @rbname: Name of the RAMBLock of the request. NULL means the 1946 * same that last one. 1947 * @start: starting address from the start of the RAMBlock 1948 * @len: length (in bytes) to send 1949 */ 1950 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, 1951 Error **errp) 1952 { 1953 RAMBlock *ramblock; 1954 RAMState *rs = ram_state; 1955 1956 stat64_add(&mig_stats.postcopy_requests, 1); 1957 RCU_READ_LOCK_GUARD(); 1958 1959 if (!rbname) { 1960 /* Reuse last RAMBlock */ 1961 ramblock = rs->last_req_rb; 1962 1963 if (!ramblock) { 1964 /* 1965 * Shouldn't happen, we can't reuse the last RAMBlock if 1966 * it's the 1st request. 1967 */ 1968 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block"); 1969 return -1; 1970 } 1971 } else { 1972 ramblock = qemu_ram_block_by_name(rbname); 1973 1974 if (!ramblock) { 1975 /* We shouldn't be asked for a non-existent RAMBlock */ 1976 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname); 1977 return -1; 1978 } 1979 rs->last_req_rb = ramblock; 1980 } 1981 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1982 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1983 error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, " 1984 "start=" RAM_ADDR_FMT " len=" 1985 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1986 start, len, ramblock->used_length); 1987 return -1; 1988 } 1989 1990 /* 1991 * When with postcopy preempt, we send back the page directly in the 1992 * rp-return thread. 
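     *
     * Rough sketch of the urgent path below (a summary, not extra
     * behaviour): the return-path thread initialises the
     * RAM_CHANNEL_POSTCOPY search state at the requested page, points it
     * at the preempt channel, and then sends one host page per
     * ram_save_host_page_urgent() call until the requested length is
     * covered.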
1993 */ 1994 if (postcopy_preempt_active()) { 1995 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1996 size_t page_size = qemu_ram_pagesize(ramblock); 1997 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1998 int ret = 0; 1999 2000 qemu_mutex_lock(&rs->bitmap_mutex); 2001 2002 pss_init(pss, ramblock, page_start); 2003 /* 2004 * Always use the preempt channel, and make sure it's there. It's 2005 * safe to access without lock, because when rp-thread is running 2006 * we should be the only one who operates on the qemufile 2007 */ 2008 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2009 assert(pss->pss_channel); 2010 2011 /* 2012 * It must be either one or multiple of host page size. Just 2013 * assert; if something wrong we're mostly split brain anyway. 2014 */ 2015 assert(len % page_size == 0); 2016 while (len) { 2017 if (ram_save_host_page_urgent(pss)) { 2018 error_setg(errp, "ram_save_host_page_urgent() failed: " 2019 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2020 ramblock->idstr, start); 2021 ret = -1; 2022 break; 2023 } 2024 /* 2025 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2026 * will automatically be moved and point to the next host page 2027 * we're going to send, so no need to update here. 2028 * 2029 * Normally QEMU never sends >1 host page in requests, so 2030 * logically we don't even need that as the loop should only 2031 * run once, but just to be consistent. 2032 */ 2033 len -= page_size; 2034 }; 2035 qemu_mutex_unlock(&rs->bitmap_mutex); 2036 2037 return ret; 2038 } 2039 2040 struct RAMSrcPageRequest *new_entry = 2041 g_new0(struct RAMSrcPageRequest, 1); 2042 new_entry->rb = ramblock; 2043 new_entry->offset = start; 2044 new_entry->len = len; 2045 2046 memory_region_ref(ramblock->mr); 2047 qemu_mutex_lock(&rs->src_page_req_mutex); 2048 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2049 migration_make_urgent_request(); 2050 qemu_mutex_unlock(&rs->src_page_req_mutex); 2051 2052 return 0; 2053 } 2054 2055 /** 2056 * ram_save_target_page: save one target page to the precopy thread 2057 * OR to multifd workers. 2058 * 2059 * @rs: current RAM state 2060 * @pss: data about the page we want to send 2061 */ 2062 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2063 { 2064 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2065 int res; 2066 2067 /* Hand over to RDMA first */ 2068 if (migrate_rdma()) { 2069 res = rdma_control_save_page(pss->pss_channel, pss->block->offset, 2070 offset, TARGET_PAGE_SIZE); 2071 2072 if (res == RAM_SAVE_CONTROL_DELAYED) { 2073 res = 1; 2074 } 2075 return res; 2076 } 2077 2078 if (!migrate_multifd() 2079 || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { 2080 if (save_zero_page(rs, pss, offset)) { 2081 return 1; 2082 } 2083 } 2084 2085 if (migrate_multifd() && !migration_in_postcopy()) { 2086 return ram_save_multifd_page(pss->block, offset); 2087 } 2088 2089 return ram_save_page(rs, pss); 2090 } 2091 2092 /* Should be called before sending a host page */ 2093 static void pss_host_page_prepare(PageSearchStatus *pss) 2094 { 2095 /* How many guest pages are there in one host page? */ 2096 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2097 2098 pss->host_page_sending = true; 2099 if (guest_pfns <= 1) { 2100 /* 2101 * This covers both when guest psize == host psize, or when guest 2102 * has larger psize than the host (guest_pfns==0). 
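         *
         * For the former, the "host page" tracked below is a single
         * target page, so host_page_start/host_page_end simply bracket
         * pss->page.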
2103 * 2104 * For the latter, we always send one whole guest page per 2105 * iteration of the host page (example: an Alpha VM on x86 host 2106 * will have guest psize 8K while host psize 4K). 2107 */ 2108 pss->host_page_start = pss->page; 2109 pss->host_page_end = pss->page + 1; 2110 } else { 2111 /* 2112 * The host page spans over multiple guest pages, we send them 2113 * within the same host page iteration. 2114 */ 2115 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2116 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2117 } 2118 } 2119 2120 /* 2121 * Whether the page pointed by PSS is within the host page being sent. 2122 * Must be called after a previous pss_host_page_prepare(). 2123 */ 2124 static bool pss_within_range(PageSearchStatus *pss) 2125 { 2126 ram_addr_t ram_addr; 2127 2128 assert(pss->host_page_sending); 2129 2130 /* Over host-page boundary? */ 2131 if (pss->page >= pss->host_page_end) { 2132 return false; 2133 } 2134 2135 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2136 2137 return offset_in_ramblock(pss->block, ram_addr); 2138 } 2139 2140 static void pss_host_page_finish(PageSearchStatus *pss) 2141 { 2142 pss->host_page_sending = false; 2143 /* This is not needed, but just to reset it */ 2144 pss->host_page_start = pss->host_page_end = 0; 2145 } 2146 2147 static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) 2148 { 2149 PageLocationHint *hint = &rs->page_hint; 2150 2151 /* If there's a pending hint not consumed, don't bother */ 2152 if (hint->valid) { 2153 return; 2154 } 2155 2156 /* Provide a hint to the background stream otherwise */ 2157 hint->location.block = pss->block; 2158 hint->location.offset = pss->page; 2159 hint->valid = true; 2160 } 2161 2162 /* 2163 * Send an urgent host page specified by `pss'. Need to be called with 2164 * bitmap_mutex held. 2165 * 2166 * Returns 0 if save host page succeeded, false otherwise. 2167 */ 2168 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2169 { 2170 bool page_dirty, sent = false; 2171 RAMState *rs = ram_state; 2172 int ret = 0; 2173 2174 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2175 pss_host_page_prepare(pss); 2176 2177 /* 2178 * If precopy is sending the same page, let it be done in precopy, or 2179 * we could send the same page in two channels and none of them will 2180 * receive the whole page. 2181 */ 2182 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2183 trace_postcopy_preempt_hit(pss->block->idstr, 2184 pss->page << TARGET_PAGE_BITS); 2185 return 0; 2186 } 2187 2188 do { 2189 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2190 2191 if (page_dirty) { 2192 /* Be strict to return code; it must be 1, or what else? */ 2193 if (ram_save_target_page(rs, pss) != 1) { 2194 error_report_once("%s: ram_save_target_page failed", __func__); 2195 ret = -1; 2196 goto out; 2197 } 2198 sent = true; 2199 } 2200 pss_find_next_dirty(pss); 2201 } while (pss_within_range(pss)); 2202 out: 2203 pss_host_page_finish(pss); 2204 /* For urgent requests, flush immediately if sent */ 2205 if (sent) { 2206 qemu_fflush(pss->pss_channel); 2207 ram_page_hint_update(rs, pss); 2208 } 2209 return ret; 2210 } 2211 2212 /** 2213 * ram_save_host_page: save a whole host page 2214 * 2215 * Starting at *offset send pages up to the end of the current host 2216 * page. It's valid for the initial offset to point into the middle of 2217 * a host page in which case the remainder of the hostpage is sent. 
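 *
 * Illustrative example (numbers are only an example): with a 2M hugepage
 * backed RAMBlock and 4K target pages there are 512 target pages per host
 * page, so starting at target page 300 of a host page the loop below
 * covers target pages 300..511 of that host page.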
2218 * Only dirty target pages are sent. Note that the host page size may 2219 * be a huge page for this block. 2220 * 2221 * The saving stops at the boundary of the used_length of the block 2222 * if the RAMBlock isn't a multiple of the host page size. 2223 * 2224 * The caller must be with ram_state.bitmap_mutex held to call this 2225 * function. Note that this function can temporarily release the lock, but 2226 * when the function is returned it'll make sure the lock is still held. 2227 * 2228 * Returns the number of pages written or negative on error 2229 * 2230 * @rs: current RAM state 2231 * @pss: data about the page we want to send 2232 */ 2233 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2234 { 2235 bool page_dirty, preempt_active = postcopy_preempt_active(); 2236 int tmppages, pages = 0; 2237 size_t pagesize_bits = 2238 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2239 unsigned long start_page = pss->page; 2240 int res; 2241 2242 if (migrate_ram_is_ignored(pss->block)) { 2243 error_report("block %s should not be migrated !", pss->block->idstr); 2244 return 0; 2245 } 2246 2247 /* Update host page boundary information */ 2248 pss_host_page_prepare(pss); 2249 2250 do { 2251 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2252 2253 /* Check the pages is dirty and if it is send it */ 2254 if (page_dirty) { 2255 /* 2256 * Properly yield the lock only in postcopy preempt mode 2257 * because both migration thread and rp-return thread can 2258 * operate on the bitmaps. 2259 */ 2260 if (preempt_active) { 2261 qemu_mutex_unlock(&rs->bitmap_mutex); 2262 } 2263 tmppages = ram_save_target_page(rs, pss); 2264 if (tmppages >= 0) { 2265 pages += tmppages; 2266 /* 2267 * Allow rate limiting to happen in the middle of huge pages if 2268 * something is sent in the current iteration. 2269 */ 2270 if (pagesize_bits > 1 && tmppages > 0) { 2271 migration_rate_limit(); 2272 } 2273 } 2274 if (preempt_active) { 2275 qemu_mutex_lock(&rs->bitmap_mutex); 2276 } 2277 } else { 2278 tmppages = 0; 2279 } 2280 2281 if (tmppages < 0) { 2282 pss_host_page_finish(pss); 2283 return tmppages; 2284 } 2285 2286 pss_find_next_dirty(pss); 2287 } while (pss_within_range(pss)); 2288 2289 pss_host_page_finish(pss); 2290 2291 res = ram_save_release_protection(rs, pss, start_page); 2292 return (res < 0 ? res : pages); 2293 } 2294 2295 static bool ram_page_hint_valid(RAMState *rs) 2296 { 2297 /* There's only page hint during postcopy preempt mode */ 2298 if (!postcopy_preempt_active()) { 2299 return false; 2300 } 2301 2302 return rs->page_hint.valid; 2303 } 2304 2305 static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, 2306 unsigned long *page) 2307 { 2308 PageLocationHint *hint = &rs->page_hint; 2309 2310 assert(hint->valid); 2311 2312 *block = hint->location.block; 2313 *page = hint->location.offset; 2314 2315 /* Mark the hint consumed */ 2316 hint->valid = false; 2317 } 2318 2319 /** 2320 * ram_find_and_save_block: finds a dirty page and sends it to f 2321 * 2322 * Called within an RCU critical section. 2323 * 2324 * Returns the number of pages written where zero means no dirty pages, 2325 * or negative on error 2326 * 2327 * @rs: current RAM state 2328 * 2329 * On systems where host-page-size > target-page-size it will send all the 2330 * pages in a host page that are dirty. 
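 *
 * Roughly: queued postcopy requests are served first via
 * get_queued_page(); otherwise find_dirty_block() continues the linear
 * scan from the last position (the preempt page hint, when valid,
 * overrides the starting point), and whatever dirty page is found is
 * pushed out one host page at a time by ram_save_host_page().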
2331 */ 2332 static int ram_find_and_save_block(RAMState *rs) 2333 { 2334 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2335 unsigned long next_page; 2336 RAMBlock *next_block; 2337 int pages = 0; 2338 2339 /* No dirty page as there is zero RAM */ 2340 if (!rs->ram_bytes_total) { 2341 return pages; 2342 } 2343 2344 /* 2345 * Always keep last_seen_block/last_page valid during this procedure, 2346 * because find_dirty_block() relies on these values (e.g., we compare 2347 * last_seen_block with pss.block to see whether we searched all the 2348 * ramblocks) to detect the completion of migration. Having NULL value 2349 * of last_seen_block can conditionally cause below loop to run forever. 2350 */ 2351 if (!rs->last_seen_block) { 2352 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2353 rs->last_page = 0; 2354 } 2355 2356 if (ram_page_hint_valid(rs)) { 2357 ram_page_hint_collect(rs, &next_block, &next_page); 2358 } else { 2359 next_block = rs->last_seen_block; 2360 next_page = rs->last_page; 2361 } 2362 2363 pss_init(pss, next_block, next_page); 2364 2365 while (true){ 2366 if (!get_queued_page(rs, pss)) { 2367 /* priority queue empty, so just search for something dirty */ 2368 int res = find_dirty_block(rs, pss); 2369 2370 if (res == PAGE_ALL_CLEAN) { 2371 break; 2372 } else if (res == PAGE_TRY_AGAIN) { 2373 continue; 2374 } else if (res < 0) { 2375 pages = res; 2376 break; 2377 } 2378 2379 /* Otherwise we must have a dirty page to move */ 2380 assert(res == PAGE_DIRTY_FOUND); 2381 } 2382 pages = ram_save_host_page(rs, pss); 2383 if (pages) { 2384 break; 2385 } 2386 } 2387 2388 rs->last_seen_block = pss->block; 2389 rs->last_page = pss->page; 2390 2391 return pages; 2392 } 2393 2394 static uint64_t ram_bytes_total_with_ignored(void) 2395 { 2396 RAMBlock *block; 2397 uint64_t total = 0; 2398 2399 RCU_READ_LOCK_GUARD(); 2400 2401 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2402 total += block->used_length; 2403 } 2404 return total; 2405 } 2406 2407 uint64_t ram_bytes_total(void) 2408 { 2409 RAMBlock *block; 2410 uint64_t total = 0; 2411 2412 RCU_READ_LOCK_GUARD(); 2413 2414 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2415 total += block->used_length; 2416 } 2417 return total; 2418 } 2419 2420 static void xbzrle_load_setup(void) 2421 { 2422 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2423 } 2424 2425 static void xbzrle_load_cleanup(void) 2426 { 2427 g_free(XBZRLE.decoded_buf); 2428 XBZRLE.decoded_buf = NULL; 2429 } 2430 2431 static void ram_state_cleanup(RAMState **rsp) 2432 { 2433 if (*rsp) { 2434 migration_page_queue_free(*rsp); 2435 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2436 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2437 g_free(*rsp); 2438 *rsp = NULL; 2439 } 2440 } 2441 2442 static void xbzrle_cleanup(void) 2443 { 2444 XBZRLE_cache_lock(); 2445 if (XBZRLE.cache) { 2446 cache_fini(XBZRLE.cache); 2447 g_free(XBZRLE.encoded_buf); 2448 g_free(XBZRLE.current_buf); 2449 g_free(XBZRLE.zero_target_page); 2450 XBZRLE.cache = NULL; 2451 XBZRLE.encoded_buf = NULL; 2452 XBZRLE.current_buf = NULL; 2453 XBZRLE.zero_target_page = NULL; 2454 } 2455 XBZRLE_cache_unlock(); 2456 } 2457 2458 static void ram_bitmaps_destroy(void) 2459 { 2460 RAMBlock *block; 2461 2462 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2463 g_free(block->clear_bmap); 2464 block->clear_bmap = NULL; 2465 g_free(block->bmap); 2466 block->bmap = NULL; 2467 g_free(block->file_bmap); 2468 block->file_bmap = NULL; 2469 } 2470 } 2471 2472 static void ram_save_cleanup(void *opaque) 2473 { 2474 RAMState **rsp = opaque; 2475 2476 /* 
We don't use dirty log with background snapshots */ 2477 if (!migrate_background_snapshot()) { 2478 /* caller have hold BQL or is in a bh, so there is 2479 * no writing race against the migration bitmap 2480 */ 2481 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2482 /* 2483 * do not stop dirty log without starting it, since 2484 * memory_global_dirty_log_stop will assert that 2485 * memory_global_dirty_log_start/stop used in pairs 2486 */ 2487 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2488 } 2489 } 2490 2491 ram_bitmaps_destroy(); 2492 2493 xbzrle_cleanup(); 2494 multifd_ram_save_cleanup(); 2495 ram_state_cleanup(rsp); 2496 } 2497 2498 static void ram_page_hint_reset(PageLocationHint *hint) 2499 { 2500 hint->location.block = NULL; 2501 hint->location.offset = 0; 2502 hint->valid = false; 2503 } 2504 2505 static void ram_state_reset(RAMState *rs) 2506 { 2507 int i; 2508 2509 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2510 rs->pss[i].last_sent_block = NULL; 2511 } 2512 2513 rs->last_seen_block = NULL; 2514 rs->last_page = 0; 2515 rs->last_version = ram_list.version; 2516 rs->xbzrle_started = false; 2517 2518 ram_page_hint_reset(&rs->page_hint); 2519 } 2520 2521 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2522 2523 /* **** functions for postcopy ***** */ 2524 2525 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2526 { 2527 struct RAMBlock *block; 2528 2529 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2530 unsigned long *bitmap = block->bmap; 2531 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2532 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2533 2534 while (run_start < range) { 2535 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2536 ram_discard_range(block->idstr, 2537 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2538 ((ram_addr_t)(run_end - run_start)) 2539 << TARGET_PAGE_BITS); 2540 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2541 } 2542 } 2543 } 2544 2545 /** 2546 * postcopy_send_discard_bm_ram: discard a RAMBlock 2547 * 2548 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2549 * 2550 * @ms: current migration state 2551 * @block: RAMBlock to discard 2552 */ 2553 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2554 { 2555 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2556 unsigned long current; 2557 unsigned long *bitmap = block->bmap; 2558 2559 for (current = 0; current < end; ) { 2560 unsigned long one = find_next_bit(bitmap, end, current); 2561 unsigned long zero, discard_length; 2562 2563 if (one >= end) { 2564 break; 2565 } 2566 2567 zero = find_next_zero_bit(bitmap, end, one + 1); 2568 2569 if (zero >= end) { 2570 discard_length = end - one; 2571 } else { 2572 discard_length = zero - one; 2573 } 2574 postcopy_discard_send_range(ms, one, discard_length); 2575 current = one + discard_length; 2576 } 2577 } 2578 2579 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2580 2581 /** 2582 * postcopy_each_ram_send_discard: discard all RAMBlocks 2583 * 2584 * Utility for the outgoing postcopy code. 2585 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2586 * passing it bitmap indexes and name. 
2587 * (qemu_ram_foreach_block ends up passing unscaled lengths 2588 * which would mean postcopy code would have to deal with target page) 2589 * 2590 * @ms: current migration state 2591 */ 2592 static void postcopy_each_ram_send_discard(MigrationState *ms) 2593 { 2594 struct RAMBlock *block; 2595 2596 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2597 postcopy_discard_send_init(ms, block->idstr); 2598 2599 /* 2600 * Deal with TPS != HPS and huge pages. It discard any partially sent 2601 * host-page size chunks, mark any partially dirty host-page size 2602 * chunks as all dirty. In this case the host-page is the host-page 2603 * for the particular RAMBlock, i.e. it might be a huge page. 2604 */ 2605 postcopy_chunk_hostpages_pass(ms, block); 2606 2607 /* 2608 * Postcopy sends chunks of bitmap over the wire, but it 2609 * just needs indexes at this point, avoids it having 2610 * target page specific code. 2611 */ 2612 postcopy_send_discard_bm_ram(ms, block); 2613 postcopy_discard_send_finish(ms); 2614 } 2615 } 2616 2617 /** 2618 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2619 * 2620 * Helper for postcopy_chunk_hostpages; it's called twice to 2621 * canonicalize the two bitmaps, that are similar, but one is 2622 * inverted. 2623 * 2624 * Postcopy requires that all target pages in a hostpage are dirty or 2625 * clean, not a mix. This function canonicalizes the bitmaps. 2626 * 2627 * @ms: current migration state 2628 * @block: block that contains the page we want to canonicalize 2629 */ 2630 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2631 { 2632 RAMState *rs = ram_state; 2633 unsigned long *bitmap = block->bmap; 2634 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2635 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2636 unsigned long run_start; 2637 2638 if (block->page_size == TARGET_PAGE_SIZE) { 2639 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2640 return; 2641 } 2642 2643 /* Find a dirty page */ 2644 run_start = find_next_bit(bitmap, pages, 0); 2645 2646 while (run_start < pages) { 2647 2648 /* 2649 * If the start of this run of pages is in the middle of a host 2650 * page, then we need to fixup this host page. 2651 */ 2652 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2653 /* Find the end of this run */ 2654 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2655 /* 2656 * If the end isn't at the start of a host page, then the 2657 * run doesn't finish at the end of a host page 2658 * and we need to discard. 2659 */ 2660 } 2661 2662 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2663 unsigned long page; 2664 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2665 host_ratio); 2666 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2667 2668 /* Clean up the bitmap */ 2669 for (page = fixup_start_addr; 2670 page < fixup_start_addr + host_ratio; page++) { 2671 /* 2672 * Remark them as dirty, updating the count for any pages 2673 * that weren't previously dirty. 
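                 *
                 * Worked example (illustrative numbers): with host_ratio
                 * of 512 (2M host pages, 4K target pages), a dirty run
                 * starting at target page 700 begins in the middle of
                 * host page 1 (pages 512..1023), so all of pages
                 * 512..1023 are (re)marked dirty here.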
2674 */ 2675 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2676 } 2677 } 2678 2679 /* Find the next dirty page for the next iteration */ 2680 run_start = find_next_bit(bitmap, pages, run_start); 2681 } 2682 } 2683 2684 /** 2685 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2686 * 2687 * Transmit the set of pages to be discarded after precopy to the target 2688 * these are pages that: 2689 * a) Have been previously transmitted but are now dirty again 2690 * b) Pages that have never been transmitted, this ensures that 2691 * any pages on the destination that have been mapped by background 2692 * tasks get discarded (transparent huge pages is the specific concern) 2693 * Hopefully this is pretty sparse 2694 * 2695 * @ms: current migration state 2696 */ 2697 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2698 { 2699 RAMState *rs = ram_state; 2700 2701 RCU_READ_LOCK_GUARD(); 2702 2703 /* This should be our last sync, the src is now paused */ 2704 migration_bitmap_sync(rs, false); 2705 2706 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2707 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2708 rs->last_seen_block = NULL; 2709 rs->last_page = 0; 2710 2711 postcopy_each_ram_send_discard(ms); 2712 2713 trace_ram_postcopy_send_discard_bitmap(); 2714 } 2715 2716 /** 2717 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2718 * 2719 * Returns zero on success 2720 * 2721 * @rbname: name of the RAMBlock of the request. NULL means the 2722 * same that last one. 2723 * @start: RAMBlock starting page 2724 * @length: RAMBlock size 2725 */ 2726 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2727 { 2728 trace_ram_discard_range(rbname, start, length); 2729 2730 RCU_READ_LOCK_GUARD(); 2731 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2732 2733 if (!rb) { 2734 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2735 return -1; 2736 } 2737 2738 /* 2739 * On source VM, we don't need to update the received bitmap since 2740 * we don't even have one. 2741 */ 2742 if (rb->receivedmap) { 2743 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2744 length >> qemu_target_page_bits()); 2745 } 2746 2747 return ram_block_discard_range(rb, start, length); 2748 } 2749 2750 /* 2751 * For every allocation, we will try not to crash the VM if the 2752 * allocation failed. 
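 *
 * That is why the g_try_*() allocators are used below: they return NULL
 * on failure instead of aborting, letting us fail migration setup
 * gracefully while the guest keeps running.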
2753 */ 2754 static bool xbzrle_init(Error **errp) 2755 { 2756 if (!migrate_xbzrle()) { 2757 return true; 2758 } 2759 2760 XBZRLE_cache_lock(); 2761 2762 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2763 if (!XBZRLE.zero_target_page) { 2764 error_setg(errp, "%s: Error allocating zero page", __func__); 2765 goto err_out; 2766 } 2767 2768 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2769 TARGET_PAGE_SIZE, errp); 2770 if (!XBZRLE.cache) { 2771 goto free_zero_page; 2772 } 2773 2774 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2775 if (!XBZRLE.encoded_buf) { 2776 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2777 goto free_cache; 2778 } 2779 2780 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2781 if (!XBZRLE.current_buf) { 2782 error_setg(errp, "%s: Error allocating current_buf", __func__); 2783 goto free_encoded_buf; 2784 } 2785 2786 /* We are all good */ 2787 XBZRLE_cache_unlock(); 2788 return true; 2789 2790 free_encoded_buf: 2791 g_free(XBZRLE.encoded_buf); 2792 XBZRLE.encoded_buf = NULL; 2793 free_cache: 2794 cache_fini(XBZRLE.cache); 2795 XBZRLE.cache = NULL; 2796 free_zero_page: 2797 g_free(XBZRLE.zero_target_page); 2798 XBZRLE.zero_target_page = NULL; 2799 err_out: 2800 XBZRLE_cache_unlock(); 2801 return false; 2802 } 2803 2804 static bool ram_state_init(RAMState **rsp, Error **errp) 2805 { 2806 *rsp = g_try_new0(RAMState, 1); 2807 2808 if (!*rsp) { 2809 error_setg(errp, "%s: Init ramstate fail", __func__); 2810 return false; 2811 } 2812 2813 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2814 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2815 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2816 (*rsp)->ram_bytes_total = ram_bytes_total(); 2817 2818 /* 2819 * Count the total number of pages used by ram blocks not including any 2820 * gaps due to alignment or unplugs. 2821 * This must match with the initial values of dirty bitmap. 2822 */ 2823 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2824 ram_state_reset(*rsp); 2825 2826 return true; 2827 } 2828 2829 static void ram_list_init_bitmaps(void) 2830 { 2831 MigrationState *ms = migrate_get_current(); 2832 RAMBlock *block; 2833 unsigned long pages; 2834 uint8_t shift; 2835 2836 /* Skip setting bitmap if there is no RAM */ 2837 if (ram_bytes_total()) { 2838 shift = ms->clear_bitmap_shift; 2839 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2840 error_report("clear_bitmap_shift (%u) too big, using " 2841 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2842 shift = CLEAR_BITMAP_SHIFT_MAX; 2843 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2844 error_report("clear_bitmap_shift (%u) too small, using " 2845 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2846 shift = CLEAR_BITMAP_SHIFT_MIN; 2847 } 2848 2849 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2850 pages = block->max_length >> TARGET_PAGE_BITS; 2851 /* 2852 * The initial dirty bitmap for migration must be set with all 2853 * ones to make sure we'll migrate every guest RAM page to 2854 * destination. 2855 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2856 * new migration after a failed migration, ram_list. 2857 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2858 * guest memory. 
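             *
             * Size note on clear_bmap below (illustrative numbers): one
             * bit covers (1 << clear_bmap_shift) target pages, e.g. with
             * a shift of 18 and 4K target pages a single bit covers 1G
             * of guest RAM, which is the granularity used for lazily
             * clearing the dirty bitmap.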
2859 */ 2860 block->bmap = bitmap_new(pages); 2861 bitmap_set(block->bmap, 0, pages); 2862 if (migrate_mapped_ram()) { 2863 block->file_bmap = bitmap_new(pages); 2864 } 2865 block->clear_bmap_shift = shift; 2866 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2867 } 2868 } 2869 } 2870 2871 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2872 { 2873 unsigned long pages; 2874 RAMBlock *rb; 2875 2876 RCU_READ_LOCK_GUARD(); 2877 2878 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2879 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2880 rs->migration_dirty_pages -= pages; 2881 } 2882 } 2883 2884 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2885 { 2886 bool ret = true; 2887 2888 qemu_mutex_lock_ramlist(); 2889 2890 WITH_RCU_READ_LOCK_GUARD() { 2891 ram_list_init_bitmaps(); 2892 /* We don't use dirty log with background snapshots */ 2893 if (!migrate_background_snapshot()) { 2894 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2895 if (!ret) { 2896 goto out_unlock; 2897 } 2898 migration_bitmap_sync_precopy(false); 2899 } 2900 } 2901 out_unlock: 2902 qemu_mutex_unlock_ramlist(); 2903 2904 if (!ret) { 2905 ram_bitmaps_destroy(); 2906 return false; 2907 } 2908 2909 /* 2910 * After an eventual first bitmap sync, fixup the initial bitmap 2911 * containing all 1s to exclude any discarded pages from migration. 2912 */ 2913 migration_bitmap_clear_discarded_pages(rs); 2914 return true; 2915 } 2916 2917 static int ram_init_all(RAMState **rsp, Error **errp) 2918 { 2919 if (!ram_state_init(rsp, errp)) { 2920 return -1; 2921 } 2922 2923 if (!xbzrle_init(errp)) { 2924 ram_state_cleanup(rsp); 2925 return -1; 2926 } 2927 2928 if (!ram_init_bitmaps(*rsp, errp)) { 2929 return -1; 2930 } 2931 2932 return 0; 2933 } 2934 2935 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2936 { 2937 RAMBlock *block; 2938 uint64_t pages = 0; 2939 2940 /* 2941 * Postcopy is not using xbzrle/compression, so no need for that. 2942 * Also, since source are already halted, we don't need to care 2943 * about dirty page logging as well. 2944 */ 2945 2946 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2947 pages += bitmap_count_one(block->bmap, 2948 block->used_length >> TARGET_PAGE_BITS); 2949 } 2950 2951 /* This may not be aligned with current bitmaps. Recalculate. */ 2952 rs->migration_dirty_pages = pages; 2953 2954 ram_state_reset(rs); 2955 2956 /* Update RAMState cache of output QEMUFile */ 2957 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2958 2959 trace_ram_state_resume_prepare(pages); 2960 } 2961 2962 /* 2963 * This function clears bits of the free pages reported by the caller from the 2964 * migration dirty bitmap. @addr is the host address corresponding to the 2965 * start of the continuous guest free pages, and @len is the total bytes of 2966 * those pages. 2967 */ 2968 void qemu_guest_free_page_hint(void *addr, size_t len) 2969 { 2970 RAMBlock *block; 2971 ram_addr_t offset; 2972 size_t used_len, start, npages; 2973 2974 /* This function is currently expected to be used during live migration */ 2975 if (!migration_is_running()) { 2976 return; 2977 } 2978 2979 for (; len > 0; len -= used_len, addr += used_len) { 2980 block = qemu_ram_block_from_host(addr, false, &offset); 2981 if (unlikely(!block || offset >= block->used_length)) { 2982 /* 2983 * The implementation might not support RAMBlock resize during 2984 * live migration, but it could happen in theory with future 2985 * updates. So we add a check here to capture that case. 
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        /*
         * The skipped free pages are equivalent to having been sent, from
         * clear_bmap's perspective, so clear the bits from the memory region
         * bitmap which are initially set. Otherwise those skipped pages will
         * be sent in the next round after syncing from the memory region
         * bitmap.
         */
        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
        ram_state->migration_dirty_pages -=
            bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

#define MAPPED_RAM_HDR_VERSION 1
struct MappedRamHeader {
    uint32_t version;
    /*
     * The target's page size, so we know how many pages are in the
     * bitmap.
     */
    uint64_t page_size;
    /*
     * The offset in the migration file where the pages bitmap is
     * stored.
     */
    uint64_t bitmap_offset;
    /*
     * The offset in the migration file where the actual pages (data)
     * are stored.
     */
    uint64_t pages_offset;
} QEMU_PACKED;
typedef struct MappedRamHeader MappedRamHeader;

static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block)
{
    g_autofree MappedRamHeader *header = NULL;
    size_t header_size, bitmap_size;
    long num_pages;

    header = g_new0(MappedRamHeader, 1);
    header_size = sizeof(MappedRamHeader);

    num_pages = block->used_length >> TARGET_PAGE_BITS;
    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    /*
     * Save the file offsets of where the bitmap and the pages should
     * go as they are written at the end of migration and during the
     * iterative phase, respectively.
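     *
     * The resulting region for each RAMBlock is laid out as (sketch):
     * the header, then the dirty bitmap at bitmap_offset, then padding
     * up to MAPPED_RAM_FILE_OFFSET_ALIGNMENT, then the page data at
     * pages_offset; the next RAMBlock's header follows the page data.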
3052 */ 3053 block->bitmap_offset = qemu_get_offset(file) + header_size; 3054 block->pages_offset = ROUND_UP(block->bitmap_offset + 3055 bitmap_size, 3056 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 3057 3058 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 3059 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 3060 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 3061 header->pages_offset = cpu_to_be64(block->pages_offset); 3062 3063 qemu_put_buffer(file, (uint8_t *) header, header_size); 3064 3065 /* prepare offset for next ramblock */ 3066 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 3067 } 3068 3069 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 3070 Error **errp) 3071 { 3072 size_t ret, header_size = sizeof(MappedRamHeader); 3073 3074 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 3075 if (ret != header_size) { 3076 error_setg(errp, "Could not read whole mapped-ram migration header " 3077 "(expected %zd, got %zd bytes)", header_size, ret); 3078 return false; 3079 } 3080 3081 /* migration stream is big-endian */ 3082 header->version = be32_to_cpu(header->version); 3083 3084 if (header->version > MAPPED_RAM_HDR_VERSION) { 3085 error_setg(errp, "Migration mapped-ram capability version not " 3086 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 3087 header->version); 3088 return false; 3089 } 3090 3091 header->page_size = be64_to_cpu(header->page_size); 3092 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 3093 header->pages_offset = be64_to_cpu(header->pages_offset); 3094 3095 return true; 3096 } 3097 3098 /* 3099 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3100 * long-running RCU critical section. When rcu-reclaims in the code 3101 * start to become numerous it will be necessary to reduce the 3102 * granularity of these critical sections. 3103 */ 3104 3105 /** 3106 * ram_save_setup: Setup RAM for migration 3107 * 3108 * Returns zero to indicate success and negative for error 3109 * 3110 * @f: QEMUFile where to send the data 3111 * @opaque: RAMState pointer 3112 * @errp: pointer to Error*, to store an error if it happens. 3113 */ 3114 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 3115 { 3116 RAMState **rsp = opaque; 3117 RAMBlock *block; 3118 int ret, max_hg_page_size; 3119 3120 /* migration has already setup the bitmap, reuse it. */ 3121 if (!migration_in_colo_state()) { 3122 if (ram_init_all(rsp, errp) != 0) { 3123 return -1; 3124 } 3125 } 3126 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3127 3128 /* 3129 * ??? Mirrors the previous value of qemu_host_page_size, 3130 * but is this really what was intended for the migration? 
3131 */ 3132 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 3133 3134 WITH_RCU_READ_LOCK_GUARD() { 3135 qemu_put_be64(f, ram_bytes_total_with_ignored() 3136 | RAM_SAVE_FLAG_MEM_SIZE); 3137 3138 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3139 qemu_put_byte(f, strlen(block->idstr)); 3140 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3141 qemu_put_be64(f, block->used_length); 3142 if (migrate_postcopy_ram() && 3143 block->page_size != max_hg_page_size) { 3144 qemu_put_be64(f, block->page_size); 3145 } 3146 if (migrate_ignore_shared()) { 3147 qemu_put_be64(f, block->mr->addr); 3148 } 3149 3150 if (migrate_mapped_ram()) { 3151 mapped_ram_setup_ramblock(f, block); 3152 } 3153 } 3154 } 3155 3156 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3157 if (ret < 0) { 3158 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3159 qemu_file_set_error(f, ret); 3160 return ret; 3161 } 3162 3163 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3164 if (ret < 0) { 3165 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3166 qemu_file_set_error(f, ret); 3167 return ret; 3168 } 3169 3170 if (migrate_multifd()) { 3171 multifd_ram_save_setup(); 3172 } 3173 3174 /* 3175 * This operation is unfortunate.. 3176 * 3177 * For legacy QEMUs using per-section sync 3178 * ======================================= 3179 * 3180 * This must exist because the EOS below requires the SYNC messages 3181 * per-channel to work. 3182 * 3183 * For modern QEMUs using per-round sync 3184 * ===================================== 3185 * 3186 * Logically such sync is not needed, and recv threads should not run 3187 * until setup ready (using things like channels_ready on src). Then 3188 * we should be all fine. 3189 * 3190 * However even if we add channels_ready to recv side in new QEMUs, old 3191 * QEMU won't have them so this sync will still be needed to make sure 3192 * multifd recv threads won't start processing guest pages early before 3193 * ram_load_setup() is properly done. 3194 * 3195 * Let's stick with this. Fortunately the overhead is low to sync 3196 * during setup because the VM is running, so at least it's not 3197 * accounted as part of downtime. 3198 */ 3199 bql_unlock(); 3200 ret = multifd_ram_flush_and_sync(f); 3201 bql_lock(); 3202 if (ret < 0) { 3203 error_setg(errp, "%s: multifd synchronization failed", __func__); 3204 return ret; 3205 } 3206 3207 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3208 ret = qemu_fflush(f); 3209 if (ret < 0) { 3210 error_setg_errno(errp, -ret, "%s failed", __func__); 3211 } 3212 return ret; 3213 } 3214 3215 static void ram_save_file_bmap(QEMUFile *f) 3216 { 3217 RAMBlock *block; 3218 3219 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3220 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3221 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3222 3223 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3224 block->bitmap_offset); 3225 ram_transferred_add(bitmap_size); 3226 3227 /* 3228 * Free the bitmap here to catch any synchronization issues 3229 * with multifd channels. No channels should be sending pages 3230 * after we've written the bitmap to file. 
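         *
         * A late ramblock_set_file_bmap_atomic() call would then trip
         * over a NULL file_bmap and crash loudly, which is exactly the
         * kind of failure we want to surface rather than silently
         * corrupting a bitmap that was already written out.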
3231 */ 3232 g_free(block->file_bmap); 3233 block->file_bmap = NULL; 3234 } 3235 } 3236 3237 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3238 { 3239 if (set) { 3240 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3241 } else { 3242 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3243 } 3244 } 3245 3246 /** 3247 * ram_save_iterate: iterative stage for migration 3248 * 3249 * Returns zero to indicate success and negative for error 3250 * 3251 * @f: QEMUFile where to send the data 3252 * @opaque: RAMState pointer 3253 */ 3254 static int ram_save_iterate(QEMUFile *f, void *opaque) 3255 { 3256 RAMState **temp = opaque; 3257 RAMState *rs = *temp; 3258 int ret = 0; 3259 int i; 3260 int64_t t0; 3261 int done = 0; 3262 3263 /* 3264 * We'll take this lock a little bit long, but it's okay for two reasons. 3265 * Firstly, the only possible other thread to take it is who calls 3266 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3267 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3268 * guarantees that we'll at least released it in a regular basis. 3269 */ 3270 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3271 WITH_RCU_READ_LOCK_GUARD() { 3272 if (ram_list.version != rs->last_version) { 3273 ram_state_reset(rs); 3274 } 3275 3276 /* Read version before ram_list.blocks */ 3277 smp_rmb(); 3278 3279 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3280 if (ret < 0) { 3281 qemu_file_set_error(f, ret); 3282 goto out; 3283 } 3284 3285 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3286 i = 0; 3287 while ((ret = migration_rate_exceeded(f)) == 0 || 3288 postcopy_has_request(rs)) { 3289 int pages; 3290 3291 if (qemu_file_get_error(f)) { 3292 break; 3293 } 3294 3295 pages = ram_find_and_save_block(rs); 3296 /* no more pages to sent */ 3297 if (pages == 0) { 3298 done = 1; 3299 break; 3300 } 3301 3302 if (pages < 0) { 3303 qemu_file_set_error(f, pages); 3304 break; 3305 } 3306 3307 rs->target_page_count += pages; 3308 3309 /* 3310 * we want to check in the 1st loop, just in case it was the 1st 3311 * time and we had to sync the dirty bitmap. 3312 * qemu_clock_get_ns() is a bit expensive, so we only check each 3313 * some iterations 3314 */ 3315 if ((i & 63) == 0) { 3316 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3317 1000000; 3318 if (t1 > MAX_WAIT) { 3319 trace_ram_save_iterate_big_wait(t1, i); 3320 break; 3321 } 3322 } 3323 i++; 3324 } 3325 } 3326 } 3327 3328 /* 3329 * Must occur before EOS (or any QEMUFile operation) 3330 * because of RDMA protocol. 
3331 */ 3332 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3333 if (ret < 0) { 3334 qemu_file_set_error(f, ret); 3335 } 3336 3337 out: 3338 if (ret >= 0 && migration_is_running()) { 3339 if (multifd_ram_sync_per_section()) { 3340 ret = multifd_ram_flush_and_sync(f); 3341 if (ret < 0) { 3342 return ret; 3343 } 3344 } 3345 3346 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3347 ram_transferred_add(8); 3348 ret = qemu_fflush(f); 3349 } 3350 if (ret < 0) { 3351 return ret; 3352 } 3353 3354 return done; 3355 } 3356 3357 /** 3358 * ram_save_complete: function called to send the remaining amount of ram 3359 * 3360 * Returns zero to indicate success or negative on error 3361 * 3362 * Called with the BQL 3363 * 3364 * @f: QEMUFile where to send the data 3365 * @opaque: RAMState pointer 3366 */ 3367 static int ram_save_complete(QEMUFile *f, void *opaque) 3368 { 3369 RAMState **temp = opaque; 3370 RAMState *rs = *temp; 3371 int ret = 0; 3372 3373 trace_ram_save_complete(rs->migration_dirty_pages, 0); 3374 3375 rs->last_stage = !migration_in_colo_state(); 3376 3377 WITH_RCU_READ_LOCK_GUARD() { 3378 if (!migration_in_postcopy()) { 3379 migration_bitmap_sync_precopy(true); 3380 } 3381 3382 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3383 if (ret < 0) { 3384 qemu_file_set_error(f, ret); 3385 return ret; 3386 } 3387 3388 /* try transferring iterative blocks of memory */ 3389 3390 /* flush all remaining blocks regardless of rate limiting */ 3391 qemu_mutex_lock(&rs->bitmap_mutex); 3392 while (true) { 3393 int pages; 3394 3395 pages = ram_find_and_save_block(rs); 3396 /* no more blocks to sent */ 3397 if (pages == 0) { 3398 break; 3399 } 3400 if (pages < 0) { 3401 qemu_mutex_unlock(&rs->bitmap_mutex); 3402 return pages; 3403 } 3404 } 3405 qemu_mutex_unlock(&rs->bitmap_mutex); 3406 3407 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3408 if (ret < 0) { 3409 qemu_file_set_error(f, ret); 3410 return ret; 3411 } 3412 } 3413 3414 if (multifd_ram_sync_per_section()) { 3415 /* 3416 * Only the old dest QEMU will need this sync, because each EOS 3417 * will require one SYNC message on each channel. 
3418 */ 3419 ret = multifd_ram_flush_and_sync(f); 3420 if (ret < 0) { 3421 return ret; 3422 } 3423 } 3424 3425 if (migrate_mapped_ram()) { 3426 ram_save_file_bmap(f); 3427 3428 if (qemu_file_get_error(f)) { 3429 Error *local_err = NULL; 3430 int err = qemu_file_get_error_obj(f, &local_err); 3431 3432 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3433 return -err; 3434 } 3435 } 3436 3437 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3438 3439 trace_ram_save_complete(rs->migration_dirty_pages, 1); 3440 3441 return qemu_fflush(f); 3442 } 3443 3444 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3445 uint64_t *can_postcopy) 3446 { 3447 RAMState **temp = opaque; 3448 RAMState *rs = *temp; 3449 3450 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3451 3452 if (migrate_postcopy_ram()) { 3453 /* We can do postcopy, and all the data is postcopiable */ 3454 *can_postcopy += remaining_size; 3455 } else { 3456 *must_precopy += remaining_size; 3457 } 3458 } 3459 3460 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3461 uint64_t *can_postcopy) 3462 { 3463 RAMState **temp = opaque; 3464 RAMState *rs = *temp; 3465 uint64_t remaining_size; 3466 3467 if (!migration_in_postcopy()) { 3468 bql_lock(); 3469 WITH_RCU_READ_LOCK_GUARD() { 3470 migration_bitmap_sync_precopy(false); 3471 } 3472 bql_unlock(); 3473 } 3474 3475 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3476 3477 if (migrate_postcopy_ram()) { 3478 /* We can do postcopy, and all the data is postcopiable */ 3479 *can_postcopy += remaining_size; 3480 } else { 3481 *must_precopy += remaining_size; 3482 } 3483 } 3484 3485 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3486 { 3487 unsigned int xh_len; 3488 int xh_flags; 3489 uint8_t *loaded_data; 3490 3491 /* extract RLE header */ 3492 xh_flags = qemu_get_byte(f); 3493 xh_len = qemu_get_be16(f); 3494 3495 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3496 error_report("Failed to load XBZRLE page - wrong compression!"); 3497 return -1; 3498 } 3499 3500 if (xh_len > TARGET_PAGE_SIZE) { 3501 error_report("Failed to load XBZRLE page - len overflow!"); 3502 return -1; 3503 } 3504 loaded_data = XBZRLE.decoded_buf; 3505 /* load data and decode */ 3506 /* it can change loaded_data to point to an internal buffer */ 3507 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3508 3509 /* decode RLE */ 3510 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3511 TARGET_PAGE_SIZE) == -1) { 3512 error_report("Failed to load XBZRLE page - decode error!"); 3513 return -1; 3514 } 3515 3516 return 0; 3517 } 3518 3519 /** 3520 * ram_block_from_stream: read a RAMBlock id from the migration stream 3521 * 3522 * Must be called from within a rcu critical section. 3523 * 3524 * Returns a pointer from within the RCU-protected ram_list. 
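 *
 * Stream format, as handled below: when RAM_SAVE_FLAG_CONTINUE is set
 * the page belongs to the same RAMBlock as the previous page on this
 * channel, so no block id is present; otherwise a length byte and the
 * block idstr follow and are looked up with qemu_ram_block_by_name().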
3525 * 3526 * @mis: the migration incoming state pointer 3527 * @f: QEMUFile where to read the data from 3528 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3529 * @channel: the channel we're using 3530 */ 3531 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3532 QEMUFile *f, int flags, 3533 int channel) 3534 { 3535 RAMBlock *block = mis->last_recv_block[channel]; 3536 char id[256]; 3537 uint8_t len; 3538 3539 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3540 if (!block) { 3541 error_report("Ack, bad migration stream!"); 3542 return NULL; 3543 } 3544 return block; 3545 } 3546 3547 len = qemu_get_byte(f); 3548 qemu_get_buffer(f, (uint8_t *)id, len); 3549 id[len] = 0; 3550 3551 block = qemu_ram_block_by_name(id); 3552 if (!block) { 3553 error_report("Can't find block %s", id); 3554 return NULL; 3555 } 3556 3557 if (migrate_ram_is_ignored(block)) { 3558 error_report("block %s should not be migrated !", id); 3559 return NULL; 3560 } 3561 3562 mis->last_recv_block[channel] = block; 3563 3564 return block; 3565 } 3566 3567 static inline void *host_from_ram_block_offset(RAMBlock *block, 3568 ram_addr_t offset) 3569 { 3570 if (!offset_in_ramblock(block, offset)) { 3571 return NULL; 3572 } 3573 3574 return block->host + offset; 3575 } 3576 3577 static void *host_page_from_ram_block_offset(RAMBlock *block, 3578 ram_addr_t offset) 3579 { 3580 /* Note: Explicitly no check against offset_in_ramblock(). */ 3581 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3582 block->page_size); 3583 } 3584 3585 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3586 ram_addr_t offset) 3587 { 3588 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3589 } 3590 3591 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3592 { 3593 qemu_mutex_lock(&ram_state->bitmap_mutex); 3594 for (int i = 0; i < pages; i++) { 3595 ram_addr_t offset = normal[i]; 3596 ram_state->migration_dirty_pages += !test_and_set_bit( 3597 offset >> TARGET_PAGE_BITS, 3598 block->bmap); 3599 } 3600 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3601 } 3602 3603 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3604 ram_addr_t offset, bool record_bitmap) 3605 { 3606 if (!offset_in_ramblock(block, offset)) { 3607 return NULL; 3608 } 3609 if (!block->colo_cache) { 3610 error_report("%s: colo_cache is NULL in block :%s", 3611 __func__, block->idstr); 3612 return NULL; 3613 } 3614 3615 /* 3616 * During colo checkpoint, we need bitmap of these migrated pages. 3617 * It help us to decide which pages in ram cache should be flushed 3618 * into VM's RAM later. 3619 */ 3620 if (record_bitmap) { 3621 colo_record_bitmap(block, &offset, 1); 3622 } 3623 return block->colo_cache + offset; 3624 } 3625 3626 /** 3627 * ram_handle_zero: handle the zero page case 3628 * 3629 * If a page (or a whole RDMA chunk) has been 3630 * determined to be zero, then zap it. 
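 *
 * The buffer_is_zero() check below avoids writing to pages that are
 * already zero, so destination pages that were never touched (and may
 * still be backed by the shared zero page) are not dirtied needlessly.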
 *
 * @host: host address for the zero page
 * @size: size of the zero page
 */
void ram_handle_zero(void *host, uint64_t size)
{
    if (!buffer_is_zero(host, size)) {
        memset(host, 0, size);
    }
}

static void colo_init_ram_state(void)
{
    Error *local_err = NULL;

    if (!ram_state_init(&ram_state, &local_err)) {
        error_report_err(local_err);
    }
}

/*
 * colo cache: this is for the secondary VM; we cache the whole
 * memory of the secondary VM. The global lock needs to be held to
 * call this helper.
 *
 * Returns zero to indicate success or -1 on error.
 */
int colo_init_ram_cache(Error **errp)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_setg(errp, "Can't alloc memory for COLO cache of "
                           "block %s, size 0x" RAM_ADDR_FMT,
                           block->idstr, block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -1;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    Error *local_err = NULL;

    /* For memory_global_dirty_log_start below.
*/ 3708 bql_lock(); 3709 qemu_mutex_lock_ramlist(); 3710 3711 memory_global_dirty_log_sync(false); 3712 WITH_RCU_READ_LOCK_GUARD() { 3713 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3714 ramblock_sync_dirty_bitmap(ram_state, block); 3715 /* Discard this dirty bitmap record */ 3716 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3717 } 3718 if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, 3719 &local_err)) { 3720 error_report_err(local_err); 3721 } 3722 } 3723 ram_state->migration_dirty_pages = 0; 3724 qemu_mutex_unlock_ramlist(); 3725 bql_unlock(); 3726 } 3727 3728 /* It is need to hold the global lock to call this helper */ 3729 void colo_release_ram_cache(void) 3730 { 3731 RAMBlock *block; 3732 3733 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3734 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3735 g_free(block->bmap); 3736 block->bmap = NULL; 3737 } 3738 3739 WITH_RCU_READ_LOCK_GUARD() { 3740 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3741 if (block->colo_cache) { 3742 qemu_anon_ram_free(block->colo_cache, block->used_length); 3743 block->colo_cache = NULL; 3744 } 3745 } 3746 } 3747 ram_state_cleanup(&ram_state); 3748 } 3749 3750 /** 3751 * ram_load_setup: Setup RAM for migration incoming side 3752 * 3753 * Returns zero to indicate success and negative for error 3754 * 3755 * @f: QEMUFile where to receive the data 3756 * @opaque: RAMState pointer 3757 * @errp: pointer to Error*, to store an error if it happens. 3758 */ 3759 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3760 { 3761 xbzrle_load_setup(); 3762 ramblock_recv_map_init(); 3763 3764 return 0; 3765 } 3766 3767 static int ram_load_cleanup(void *opaque) 3768 { 3769 RAMBlock *rb; 3770 3771 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3772 if (memory_region_is_nonvolatile(rb->mr)) { 3773 qemu_ram_block_writeback(rb); 3774 } 3775 } 3776 3777 xbzrle_load_cleanup(); 3778 3779 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3780 g_free(rb->receivedmap); 3781 rb->receivedmap = NULL; 3782 } 3783 3784 return 0; 3785 } 3786 3787 /** 3788 * ram_postcopy_incoming_init: allocate postcopy data structures 3789 * 3790 * Returns 0 for success and negative if there was one error 3791 * 3792 * @mis: current migration incoming state 3793 * 3794 * Allocate data structures etc needed by incoming migration with 3795 * postcopy-ram. postcopy-ram's similarly names 3796 * postcopy_ram_incoming_init does the work. 3797 */ 3798 int ram_postcopy_incoming_init(MigrationIncomingState *mis, Error **errp) 3799 { 3800 return postcopy_ram_incoming_init(mis, errp); 3801 } 3802 3803 /** 3804 * ram_load_postcopy: load a page in postcopy case 3805 * 3806 * Returns 0 for success or -errno in case of error 3807 * 3808 * Called in postcopy mode by ram_load(). 3809 * rcu_read_lock is taken prior to this being called. 
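 *
 * Rough flow of the loop below: incoming target pages are collected
 * until all block->page_size / TARGET_PAGE_SIZE pieces of a host page
 * have arrived (huge pages go through a per-channel temporary buffer),
 * and only then is the whole host page placed atomically.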
3810 * 3811 * @f: QEMUFile where to send the data 3812 * @channel: the channel to use for loading 3813 */ 3814 int ram_load_postcopy(QEMUFile *f, int channel) 3815 { 3816 int flags = 0, ret = 0; 3817 bool place_needed = false; 3818 bool matches_target_page_size = false; 3819 MigrationIncomingState *mis = migration_incoming_get_current(); 3820 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3821 3822 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3823 ram_addr_t addr; 3824 void *page_buffer = NULL; 3825 void *place_source = NULL; 3826 RAMBlock *block = NULL; 3827 uint8_t ch; 3828 3829 addr = qemu_get_be64(f); 3830 3831 /* 3832 * If qemu file error, we should stop here, and then "addr" 3833 * may be invalid 3834 */ 3835 ret = qemu_file_get_error(f); 3836 if (ret) { 3837 break; 3838 } 3839 3840 flags = addr & ~TARGET_PAGE_MASK; 3841 addr &= TARGET_PAGE_MASK; 3842 3843 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3844 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3845 block = ram_block_from_stream(mis, f, flags, channel); 3846 if (!block) { 3847 ret = -EINVAL; 3848 break; 3849 } 3850 3851 /* 3852 * Relying on used_length is racy and can result in false positives. 3853 * We might place pages beyond used_length in case RAM was shrunk 3854 * while in postcopy, which is fine - trying to place via 3855 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3856 */ 3857 if (!block->host || addr >= block->postcopy_length) { 3858 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3859 ret = -EINVAL; 3860 break; 3861 } 3862 tmp_page->target_pages++; 3863 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3864 /* 3865 * Postcopy requires that we place whole host pages atomically; 3866 * these may be huge pages for RAMBlocks that are backed by 3867 * hugetlbfs. 3868 * To make it atomic, the data is read into a temporary page 3869 * that's moved into place later. 3870 * The migration protocol uses, possibly smaller, target-pages 3871 * however the source ensures it always sends all the components 3872 * of a host page in one chunk. 3873 */ 3874 page_buffer = tmp_page->tmp_huge_page + 3875 host_page_offset_from_ram_block_offset(block, addr); 3876 /* If all TP are zero then we can optimise the place */ 3877 if (tmp_page->target_pages == 1) { 3878 tmp_page->host_addr = 3879 host_page_from_ram_block_offset(block, addr); 3880 } else if (tmp_page->host_addr != 3881 host_page_from_ram_block_offset(block, addr)) { 3882 /* not the 1st TP within the HP */ 3883 error_report("Non-same host page detected on channel %d: " 3884 "Target host page %p, received host page %p " 3885 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3886 channel, tmp_page->host_addr, 3887 host_page_from_ram_block_offset(block, addr), 3888 block->idstr, addr, tmp_page->target_pages); 3889 ret = -EINVAL; 3890 break; 3891 } 3892 3893 /* 3894 * If it's the last part of a host page then we place the host 3895 * page 3896 */ 3897 if (tmp_page->target_pages == 3898 (block->page_size / TARGET_PAGE_SIZE)) { 3899 place_needed = true; 3900 } 3901 place_source = tmp_page->tmp_huge_page; 3902 } 3903 3904 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3905 case RAM_SAVE_FLAG_ZERO: 3906 ch = qemu_get_byte(f); 3907 if (ch != 0) { 3908 error_report("Found a zero page with value %d", ch); 3909 ret = -EINVAL; 3910 break; 3911 } 3912 /* 3913 * Can skip to set page_buffer when 3914 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
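             *
             * In that case the host page consists of this single target
             * page and all_zero is still set, so it will be placed via
             * postcopy_place_page_zero() and nothing is copied from
             * page_buffer.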
3915 */ 3916 if (!matches_target_page_size) { 3917 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3918 } 3919 break; 3920 3921 case RAM_SAVE_FLAG_PAGE: 3922 tmp_page->all_zero = false; 3923 if (!matches_target_page_size) { 3924 /* For huge pages, we always use temporary buffer */ 3925 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3926 } else { 3927 /* 3928 * For small pages that matches target page size, we 3929 * avoid the qemu_file copy. Instead we directly use 3930 * the buffer of QEMUFile to place the page. Note: we 3931 * cannot do any QEMUFile operation before using that 3932 * buffer to make sure the buffer is valid when 3933 * placing the page. 3934 */ 3935 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3936 TARGET_PAGE_SIZE); 3937 } 3938 break; 3939 case RAM_SAVE_FLAG_EOS: 3940 break; 3941 default: 3942 error_report("Unknown combination of migration flags: 0x%x" 3943 " (postcopy mode)", flags); 3944 ret = -EINVAL; 3945 break; 3946 } 3947 3948 /* Detect for any possible file errors */ 3949 if (!ret && qemu_file_get_error(f)) { 3950 ret = qemu_file_get_error(f); 3951 } 3952 3953 if (!ret && place_needed) { 3954 if (tmp_page->all_zero) { 3955 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3956 } else { 3957 ret = postcopy_place_page(mis, tmp_page->host_addr, 3958 place_source, block); 3959 } 3960 place_needed = false; 3961 postcopy_temp_page_reset(tmp_page); 3962 } 3963 } 3964 3965 return ret; 3966 } 3967 3968 static bool postcopy_is_running(void) 3969 { 3970 PostcopyState ps = postcopy_state_get(); 3971 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3972 } 3973 3974 /* 3975 * Flush content of RAM cache into SVM's memory. 3976 * Only flush the pages that be dirtied by PVM or SVM or both. 3977 */ 3978 void colo_flush_ram_cache(void) 3979 { 3980 RAMBlock *block = NULL; 3981 void *dst_host; 3982 void *src_host; 3983 unsigned long offset = 0; 3984 3985 memory_global_dirty_log_sync(false); 3986 qemu_mutex_lock(&ram_state->bitmap_mutex); 3987 WITH_RCU_READ_LOCK_GUARD() { 3988 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3989 ramblock_sync_dirty_bitmap(ram_state, block); 3990 } 3991 } 3992 3993 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3994 WITH_RCU_READ_LOCK_GUARD() { 3995 block = QLIST_FIRST_RCU(&ram_list.blocks); 3996 3997 while (block) { 3998 unsigned long num = 0; 3999 4000 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4001 if (!offset_in_ramblock(block, 4002 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4003 offset = 0; 4004 num = 0; 4005 block = QLIST_NEXT_RCU(block, next); 4006 } else { 4007 unsigned long i = 0; 4008 4009 for (i = 0; i < num; i++) { 4010 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4011 } 4012 dst_host = block->host 4013 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4014 src_host = block->colo_cache 4015 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4016 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4017 offset += num; 4018 } 4019 } 4020 } 4021 qemu_mutex_unlock(&ram_state->bitmap_mutex); 4022 trace_colo_flush_ram_cache_end(); 4023 } 4024 4025 static size_t ram_load_multifd_pages(void *host_addr, size_t size, 4026 uint64_t offset) 4027 { 4028 MultiFDRecvData *data = multifd_get_recv_data(); 4029 4030 data->opaque = host_addr; 4031 data->file_offset = offset; 4032 data->size = size; 4033 4034 if (!multifd_recv()) { 4035 return 0; 4036 } 4037 4038 return size; 4039 } 4040 4041 /** 4042 * handle_zero_mapped_ram: Zero out a range of RAM pages if required during 
4043 * mapped-ram load 4044 * 4045 * Zeroing is only performed when restoring from a snapshot (HMP loadvm). 4046 * During incoming migration or -loadvm cli snapshot load, the function is a 4047 * no-op and returns true as in those cases the pages are already guaranteed to 4048 * be zeroed. 4049 * 4050 * Returns: true on success, false on error (with @errp set). 4051 * @from_bit_idx: Starting index relative to the map of the page (inclusive) 4052 * @to_bit_idx: Ending index relative to the map of the page (exclusive) 4053 */ 4054 static bool handle_zero_mapped_ram(RAMBlock *block, unsigned long from_bit_idx, 4055 unsigned long to_bit_idx, Error **errp) 4056 { 4057 ERRP_GUARD(); 4058 ram_addr_t offset; 4059 size_t size; 4060 void *host; 4061 4062 /* 4063 * Zeroing is not needed for either -loadvm (RUN_STATE_PRELAUNCH), or 4064 * -incoming (RUN_STATE_INMIGRATE). 4065 */ 4066 if (!runstate_check(RUN_STATE_RESTORE_VM)) { 4067 return true; 4068 } 4069 4070 if (from_bit_idx >= to_bit_idx) { 4071 return true; 4072 } 4073 4074 size = TARGET_PAGE_SIZE * (to_bit_idx - from_bit_idx); 4075 offset = from_bit_idx << TARGET_PAGE_BITS; 4076 host = host_from_ram_block_offset(block, offset); 4077 if (!host) { 4078 error_setg(errp, "zero page outside of ramblock %s range", 4079 block->idstr); 4080 return false; 4081 } 4082 ram_handle_zero(host, size); 4083 4084 return true; 4085 } 4086 4087 static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 4088 long num_pages, unsigned long *bitmap, 4089 Error **errp) 4090 { 4091 ERRP_GUARD(); 4092 unsigned long set_bit_idx, clear_bit_idx = 0; 4093 ram_addr_t offset; 4094 void *host; 4095 size_t read, unread, size; 4096 4097 for (set_bit_idx = find_first_bit(bitmap, num_pages); 4098 set_bit_idx < num_pages; 4099 set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { 4100 4101 /* Zero pages */ 4102 if (!handle_zero_mapped_ram(block, clear_bit_idx, set_bit_idx, errp)) { 4103 return false; 4104 } 4105 4106 /* Non-zero pages */ 4107 clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); 4108 4109 unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); 4110 offset = set_bit_idx << TARGET_PAGE_BITS; 4111 4112 while (unread > 0) { 4113 host = host_from_ram_block_offset(block, offset); 4114 if (!host) { 4115 error_setg(errp, "page outside of ramblock %s range", 4116 block->idstr); 4117 return false; 4118 } 4119 4120 size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); 4121 4122 if (migrate_multifd()) { 4123 read = ram_load_multifd_pages(host, size, 4124 block->pages_offset + offset); 4125 } else { 4126 read = qemu_get_buffer_at(f, host, size, 4127 block->pages_offset + offset); 4128 } 4129 4130 if (!read) { 4131 goto err; 4132 } 4133 offset += read; 4134 unread -= read; 4135 } 4136 } 4137 4138 /* Handle trailing 0 pages */ 4139 if (!handle_zero_mapped_ram(block, clear_bit_idx, num_pages, errp)) { 4140 return false; 4141 } 4142 4143 return true; 4144 4145 err: 4146 qemu_file_get_error_obj(f, errp); 4147 error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT 4148 "from file offset %" PRIx64 ": ", block->idstr, offset, 4149 block->pages_offset + offset); 4150 return false; 4151 } 4152 4153 static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 4154 ram_addr_t length, Error **errp) 4155 { 4156 g_autofree unsigned long *bitmap = NULL; 4157 MappedRamHeader header; 4158 size_t bitmap_size; 4159 long num_pages; 4160 4161 if (!mapped_ram_read_header(f, &header, errp)) { 4162 return; 4163 } 4164 4165 block->pages_offset = 
header.pages_offset; 4166 4167 /* 4168 * Check the alignment of the file region that contains pages. We 4169 * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that 4170 * value to change in the future. Do only a sanity check with page 4171 * size alignment. 4172 */ 4173 if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { 4174 error_setg(errp, 4175 "Error reading ramblock %s pages, region has bad alignment", 4176 block->idstr); 4177 return; 4178 } 4179 4180 num_pages = length / header.page_size; 4181 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 4182 4183 bitmap = g_malloc0(bitmap_size); 4184 if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, 4185 header.bitmap_offset) != bitmap_size) { 4186 error_setg(errp, "Error reading dirty bitmap"); 4187 return; 4188 } 4189 4190 if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { 4191 return; 4192 } 4193 4194 /* Skip pages array */ 4195 qemu_set_offset(f, block->pages_offset + length, SEEK_SET); 4196 } 4197 4198 static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) 4199 { 4200 int ret = 0; 4201 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4202 bool postcopy_advised = migration_incoming_postcopy_advised(); 4203 int max_hg_page_size; 4204 Error *local_err = NULL; 4205 4206 assert(block); 4207 4208 if (migrate_mapped_ram()) { 4209 parse_ramblock_mapped_ram(f, block, length, &local_err); 4210 if (local_err) { 4211 error_report_err(local_err); 4212 return -EINVAL; 4213 } 4214 return 0; 4215 } 4216 4217 if (!qemu_ram_is_migratable(block)) { 4218 error_report("block %s should not be migrated !", block->idstr); 4219 return -EINVAL; 4220 } 4221 4222 if (length != block->used_length) { 4223 ret = qemu_ram_resize(block, length, &local_err); 4224 if (local_err) { 4225 error_report_err(local_err); 4226 return ret; 4227 } 4228 } 4229 4230 /* 4231 * ??? Mirrors the previous value of qemu_host_page_size, 4232 * but is this really what was intended for the migration? 
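 * Using MAX(real host page size, TARGET_PAGE_SIZE) means the hugepage
 * size check below is normally skipped for RAMBlocks backed by ordinary
 * host pages and only applies to e.g. hugetlbfs-backed blocks.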
4233 */ 4234 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 4235 4236 /* For postcopy we need to check hugepage sizes match */ 4237 if (postcopy_advised && migrate_postcopy_ram() && 4238 block->page_size != max_hg_page_size) { 4239 uint64_t remote_page_size = qemu_get_be64(f); 4240 if (remote_page_size != block->page_size) { 4241 error_report("Mismatched RAM page size %s " 4242 "(local) %zd != %" PRId64, block->idstr, 4243 block->page_size, remote_page_size); 4244 return -EINVAL; 4245 } 4246 } 4247 if (migrate_ignore_shared()) { 4248 hwaddr addr = qemu_get_be64(f); 4249 if (migrate_ram_is_ignored(block) && 4250 block->mr->addr != addr) { 4251 error_report("Mismatched GPAs for block %s " 4252 "%" PRId64 "!= %" PRId64, block->idstr, 4253 (uint64_t)addr, (uint64_t)block->mr->addr); 4254 return -EINVAL; 4255 } 4256 } 4257 ret = rdma_block_notification_handle(f, block->idstr); 4258 if (ret < 0) { 4259 qemu_file_set_error(f, ret); 4260 } 4261 4262 return ret; 4263 } 4264 4265 static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes) 4266 { 4267 int ret = 0; 4268 4269 /* Synchronize RAM block list */ 4270 while (!ret && total_ram_bytes) { 4271 RAMBlock *block; 4272 char id[256]; 4273 ram_addr_t length; 4274 int len = qemu_get_byte(f); 4275 4276 qemu_get_buffer(f, (uint8_t *)id, len); 4277 id[len] = 0; 4278 length = qemu_get_be64(f); 4279 4280 block = qemu_ram_block_by_name(id); 4281 if (block) { 4282 ret = parse_ramblock(f, block, length); 4283 } else { 4284 error_report("Unknown ramblock \"%s\", cannot accept " 4285 "migration", id); 4286 ret = -EINVAL; 4287 } 4288 total_ram_bytes -= length; 4289 } 4290 4291 return ret; 4292 } 4293 4294 /** 4295 * ram_load_precopy: load pages in precopy case 4296 * 4297 * Returns 0 for success or -errno in case of error 4298 * 4299 * Called in precopy mode by ram_load(). 4300 * rcu_read_lock is taken prior to this being called. 
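 *
 * The precopy stream is a sequence of 64-bit headers, each carrying a
 * page address in the upper bits and RAM_SAVE_FLAG_* bits in the lower
 * bits, followed by a flag-specific payload (e.g. a full page for
 * RAM_SAVE_FLAG_PAGE); RAM_SAVE_FLAG_EOS ends the section.  Each header
 * is split below as:
 *
 *     flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;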
4301 * 4302 * @f: QEMUFile where to send the data 4303 */ 4304 static int ram_load_precopy(QEMUFile *f) 4305 { 4306 MigrationIncomingState *mis = migration_incoming_get_current(); 4307 int flags = 0, ret = 0, invalid_flags = 0, i = 0; 4308 4309 if (migrate_mapped_ram()) { 4310 invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | 4311 RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | 4312 RAM_SAVE_FLAG_ZERO); 4313 } 4314 4315 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4316 ram_addr_t addr; 4317 void *host = NULL, *host_bak = NULL; 4318 uint8_t ch; 4319 4320 /* 4321 * Yield periodically to let main loop run, but an iteration of 4322 * the main loop is expensive, so do it each some iterations 4323 */ 4324 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4325 aio_co_schedule(qemu_get_current_aio_context(), 4326 qemu_coroutine_self()); 4327 qemu_coroutine_yield(); 4328 } 4329 i++; 4330 4331 addr = qemu_get_be64(f); 4332 ret = qemu_file_get_error(f); 4333 if (ret) { 4334 error_report("Getting RAM address failed"); 4335 break; 4336 } 4337 4338 flags = addr & ~TARGET_PAGE_MASK; 4339 addr &= TARGET_PAGE_MASK; 4340 4341 if (flags & invalid_flags) { 4342 error_report("Unexpected RAM flags: %d", flags & invalid_flags); 4343 4344 ret = -EINVAL; 4345 break; 4346 } 4347 4348 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4349 RAM_SAVE_FLAG_XBZRLE)) { 4350 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4351 RAM_CHANNEL_PRECOPY); 4352 4353 host = host_from_ram_block_offset(block, addr); 4354 /* 4355 * After going into COLO stage, we should not load the page 4356 * into SVM's memory directly, we put them into colo_cache firstly. 4357 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4358 * Previously, we copied all these memory in preparing stage of COLO 4359 * while we need to stop VM, which is a time-consuming process. 4360 * Here we optimize it by a trick, back-up every page while in 4361 * migration process while COLO is enabled, though it affects the 4362 * speed of the migration, but it obviously reduce the downtime of 4363 * back-up all SVM'S memory in COLO preparing stage. 4364 */ 4365 if (migration_incoming_colo_enabled()) { 4366 if (migration_incoming_in_colo_state()) { 4367 /* In COLO stage, put all pages into cache temporarily */ 4368 host = colo_cache_from_block_offset(block, addr, true); 4369 } else { 4370 /* 4371 * In migration stage but before COLO stage, 4372 * Put all pages into both cache and SVM's memory. 4373 */ 4374 host_bak = colo_cache_from_block_offset(block, addr, false); 4375 } 4376 } 4377 if (!host) { 4378 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4379 ret = -EINVAL; 4380 break; 4381 } 4382 if (!migration_incoming_in_colo_state()) { 4383 ramblock_recv_bitmap_set(block, host); 4384 } 4385 4386 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4387 } 4388 4389 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4390 case RAM_SAVE_FLAG_MEM_SIZE: 4391 ret = parse_ramblocks(f, addr); 4392 /* 4393 * For mapped-ram migration (to a file) using multifd, we sync 4394 * once and for all here to make sure all tasks we queued to 4395 * multifd threads are completed, so that all the ramblocks 4396 * (including all the guest memory pages within) are fully 4397 * loaded after this sync returns. 
4398 */ 4399 if (migrate_mapped_ram()) { 4400 multifd_recv_sync_main(); 4401 } 4402 break; 4403 4404 case RAM_SAVE_FLAG_ZERO: 4405 ch = qemu_get_byte(f); 4406 if (ch != 0) { 4407 error_report("Found a zero page with value %d", ch); 4408 ret = -EINVAL; 4409 break; 4410 } 4411 ram_handle_zero(host, TARGET_PAGE_SIZE); 4412 break; 4413 4414 case RAM_SAVE_FLAG_PAGE: 4415 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4416 break; 4417 4418 case RAM_SAVE_FLAG_XBZRLE: 4419 if (load_xbzrle(f, addr, host) < 0) { 4420 error_report("Failed to decompress XBZRLE page at " 4421 RAM_ADDR_FMT, addr); 4422 ret = -EINVAL; 4423 break; 4424 } 4425 break; 4426 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 4427 multifd_recv_sync_main(); 4428 break; 4429 case RAM_SAVE_FLAG_EOS: 4430 /* normal exit */ 4431 if (migrate_multifd() && 4432 migrate_multifd_flush_after_each_section() && 4433 /* 4434 * Mapped-ram migration flushes once and for all after 4435 * parsing ramblocks. Always ignore EOS for it. 4436 */ 4437 !migrate_mapped_ram()) { 4438 multifd_recv_sync_main(); 4439 } 4440 break; 4441 case RAM_SAVE_FLAG_HOOK: 4442 ret = rdma_registration_handle(f); 4443 if (ret < 0) { 4444 qemu_file_set_error(f, ret); 4445 } 4446 break; 4447 default: 4448 error_report("Unknown combination of migration flags: 0x%x", flags); 4449 ret = -EINVAL; 4450 } 4451 if (!ret) { 4452 ret = qemu_file_get_error(f); 4453 } 4454 if (!ret && host_bak) { 4455 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4456 } 4457 } 4458 4459 return ret; 4460 } 4461 4462 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4463 { 4464 int ret = 0; 4465 static uint64_t seq_iter; 4466 /* 4467 * If system is running in postcopy mode, page inserts to host memory must 4468 * be atomic 4469 */ 4470 bool postcopy_running = postcopy_is_running(); 4471 4472 seq_iter++; 4473 4474 if (version_id != 4) { 4475 return -EINVAL; 4476 } 4477 4478 /* 4479 * This RCU critical section can be very long running. 4480 * When RCU reclaims in the code start to become numerous, 4481 * it will be necessary to reduce the granularity of this 4482 * critical section. 4483 */ 4484 trace_ram_load_start(); 4485 WITH_RCU_READ_LOCK_GUARD() { 4486 if (postcopy_running) { 4487 /* 4488 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4489 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4490 * service fast page faults. 4491 */ 4492 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4493 } else { 4494 ret = ram_load_precopy(f); 4495 } 4496 } 4497 trace_ram_load_complete(ret, seq_iter); 4498 4499 return ret; 4500 } 4501 4502 static bool ram_has_postcopy(void *opaque) 4503 { 4504 RAMBlock *rb; 4505 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4506 if (ram_block_is_pmem(rb)) { 4507 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4508 "is not supported now!", rb->idstr, rb->host); 4509 return false; 4510 } 4511 } 4512 4513 return migrate_postcopy_ram(); 4514 } 4515 4516 /* Sync all the dirty bitmap with destination VM. 
*/ 4517 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4518 { 4519 RAMBlock *block; 4520 QEMUFile *file = s->to_dst_file; 4521 4522 trace_ram_dirty_bitmap_sync_start(); 4523 4524 qatomic_set(&rs->postcopy_bmap_sync_requested, 0); 4525 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4526 qemu_savevm_send_recv_bitmap(file, block->idstr); 4527 trace_ram_dirty_bitmap_request(block->idstr); 4528 qatomic_inc(&rs->postcopy_bmap_sync_requested); 4529 } 4530 4531 trace_ram_dirty_bitmap_sync_wait(); 4532 4533 /* Wait until all the ramblocks' dirty bitmap synced */ 4534 while (qatomic_read(&rs->postcopy_bmap_sync_requested)) { 4535 if (migration_rp_wait(s)) { 4536 return -1; 4537 } 4538 } 4539 4540 trace_ram_dirty_bitmap_sync_complete(); 4541 4542 return 0; 4543 } 4544 4545 /* 4546 * Read the received bitmap, revert it as the initial dirty bitmap. 4547 * This is only used when the postcopy migration is paused but wants 4548 * to resume from a middle point. 4549 * 4550 * Returns true if succeeded, false for errors. 4551 */ 4552 bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp) 4553 { 4554 /* from_dst_file is always valid because we're within rp_thread */ 4555 QEMUFile *file = s->rp_state.from_dst_file; 4556 g_autofree unsigned long *le_bitmap = NULL; 4557 unsigned long nbits = block->used_length >> TARGET_PAGE_BITS; 4558 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4559 uint64_t size, end_mark; 4560 RAMState *rs = ram_state; 4561 4562 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4563 4564 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4565 error_setg(errp, "Reload bitmap in incorrect state %s", 4566 MigrationStatus_str(s->state)); 4567 return false; 4568 } 4569 4570 /* 4571 * Note: see comments in ramblock_recv_bitmap_send() on why we 4572 * need the endianness conversion, and the paddings. 4573 */ 4574 local_size = ROUND_UP(local_size, 8); 4575 4576 /* Add paddings */ 4577 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4578 4579 size = qemu_get_be64(file); 4580 4581 /* The size of the bitmap should match with our ramblock */ 4582 if (size != local_size) { 4583 error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64 4584 " != 0x%"PRIx64")", block->idstr, size, local_size); 4585 return false; 4586 } 4587 4588 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4589 end_mark = qemu_get_be64(file); 4590 4591 if (qemu_file_get_error(file) || size != local_size) { 4592 error_setg(errp, "read bitmap failed for ramblock '%s': " 4593 "(size 0x%"PRIx64", got: 0x%"PRIx64")", 4594 block->idstr, local_size, size); 4595 return false; 4596 } 4597 4598 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4599 error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64, 4600 block->idstr, end_mark); 4601 return false; 4602 } 4603 4604 /* 4605 * Endianness conversion. We are during postcopy (though paused). 4606 * The dirty bitmap won't change. We can directly modify it. 4607 */ 4608 bitmap_from_le(block->bmap, le_bitmap, nbits); 4609 4610 /* 4611 * What we received is "received bitmap". Revert it as the initial 4612 * dirty bitmap for this ramblock. 4613 */ 4614 bitmap_complement(block->bmap, block->bmap, nbits); 4615 4616 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4617 ramblock_dirty_bitmap_clear_discarded_pages(block); 4618 4619 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). 
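 * At this point the bitmap marks every page the destination has not yet
 * received (minus the discarded ranges cleared above) as dirty, so the
 * resumed migration only resends the missing pages.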
*/ 4620 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4621 4622 qatomic_dec(&rs->postcopy_bmap_sync_requested); 4623 4624 /* 4625 * We succeeded to sync bitmap for current ramblock. Always kick the 4626 * migration thread to check whether all requested bitmaps are 4627 * reloaded. NOTE: it's racy to only kick when requested==0, because 4628 * we don't know whether the migration thread may still be increasing 4629 * it. 4630 */ 4631 migration_rp_kick(s); 4632 4633 return true; 4634 } 4635 4636 static int ram_resume_prepare(MigrationState *s, void *opaque) 4637 { 4638 RAMState *rs = *(RAMState **)opaque; 4639 int ret; 4640 4641 ret = ram_dirty_bitmap_sync_all(s, rs); 4642 if (ret) { 4643 return ret; 4644 } 4645 4646 ram_state_resume_prepare(rs, s->to_dst_file); 4647 4648 return 0; 4649 } 4650 4651 static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp) 4652 { 4653 int ret; 4654 4655 if (migrate_multifd()) { 4656 /* 4657 * When multifd is enabled, source QEMU needs to make sure all the 4658 * pages queued before postcopy starts have been flushed. 4659 * 4660 * The load of these pages must happen before switching to postcopy. 4661 * It's because loading of guest pages (so far) in multifd recv 4662 * threads is still non-atomic, so the load cannot happen with vCPUs 4663 * running on the destination side. 4664 * 4665 * This flush and sync will guarantee that those pages are loaded 4666 * _before_ postcopy starts on the destination. The rationale is, 4667 * this happens before VM stops (and before source QEMU sends all 4668 * the rest of the postcopy messages). So when the destination QEMU 4669 * receives the postcopy messages, it must have received the sync 4670 * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH, 4671 * or RAM_SAVE_FLAG_EOS), and such message would guarantee that 4672 * all previous guest pages queued in the multifd channels are 4673 * completely loaded. 
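 *
 * multifd_ram_flush_and_sync() below performs that flush and sync; the
 * RAM_SAVE_FLAG_EOS written afterwards closes this postcopy-prepare
 * step on the main channel.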
4674 */ 4675 ret = multifd_ram_flush_and_sync(f); 4676 if (ret < 0) { 4677 error_setg(errp, "%s: multifd flush and sync failed", __func__); 4678 return false; 4679 } 4680 } 4681 4682 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 4683 4684 return true; 4685 } 4686 4687 void postcopy_preempt_shutdown_file(MigrationState *s) 4688 { 4689 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4690 qemu_fflush(s->postcopy_qemufile_src); 4691 } 4692 4693 static SaveVMHandlers savevm_ram_handlers = { 4694 .save_setup = ram_save_setup, 4695 .save_live_iterate = ram_save_iterate, 4696 .save_complete = ram_save_complete, 4697 .has_postcopy = ram_has_postcopy, 4698 .state_pending_exact = ram_state_pending_exact, 4699 .state_pending_estimate = ram_state_pending_estimate, 4700 .load_state = ram_load, 4701 .save_cleanup = ram_save_cleanup, 4702 .load_setup = ram_load_setup, 4703 .load_cleanup = ram_load_cleanup, 4704 .resume_prepare = ram_resume_prepare, 4705 .save_postcopy_prepare = ram_save_postcopy_prepare, 4706 }; 4707 4708 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4709 size_t old_size, size_t new_size) 4710 { 4711 PostcopyState ps = postcopy_state_get(); 4712 ram_addr_t offset; 4713 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4714 Error *err = NULL; 4715 4716 if (!rb) { 4717 error_report("RAM block not found"); 4718 return; 4719 } 4720 4721 if (migrate_ram_is_ignored(rb)) { 4722 return; 4723 } 4724 4725 if (migration_is_running()) { 4726 /* 4727 * Precopy code on the source cannot deal with the size of RAM blocks 4728 * changing at random points in time - especially after sending the 4729 * RAM block sizes in the migration stream, they must no longer change. 4730 * Abort and indicate a proper reason. 4731 */ 4732 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4733 migrate_set_error(migrate_get_current(), err); 4734 error_free(err); 4735 4736 migration_cancel(); 4737 } 4738 4739 switch (ps) { 4740 case POSTCOPY_INCOMING_ADVISE: 4741 /* 4742 * Update what ram_postcopy_incoming_init()->init_range() does at the 4743 * time postcopy was advised. Syncing RAM blocks with the source will 4744 * result in RAM resizes. 4745 */ 4746 if (old_size < new_size) { 4747 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4748 error_report("RAM block '%s' discard of resized RAM failed", 4749 rb->idstr); 4750 } 4751 } 4752 rb->postcopy_length = new_size; 4753 break; 4754 case POSTCOPY_INCOMING_NONE: 4755 case POSTCOPY_INCOMING_RUNNING: 4756 case POSTCOPY_INCOMING_END: 4757 /* 4758 * Once our guest is running, postcopy does no longer care about 4759 * resizes. When growing, the new memory was not available on the 4760 * source, no handler needed. 4761 */ 4762 break; 4763 default: 4764 error_report("RAM block '%s' resized during postcopy state: %d", 4765 rb->idstr, ps); 4766 exit(-1); 4767 } 4768 } 4769 4770 static RAMBlockNotifier ram_mig_ram_notifier = { 4771 .ram_block_resized = ram_mig_ram_block_resized, 4772 }; 4773 4774 void ram_mig_init(void) 4775 { 4776 qemu_mutex_init(&XBZRLE.lock); 4777 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4778 ram_block_notifier_add(&ram_mig_ram_notifier); 4779 } 4780