1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram-compress.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration-stats.h" 40 #include "migration/register.h" 41 #include "migration/misc.h" 42 #include "qemu-file.h" 43 #include "postcopy-ram.h" 44 #include "page_cache.h" 45 #include "qemu/error-report.h" 46 #include "qapi/error.h" 47 #include "qapi/qapi-types-migration.h" 48 #include "qapi/qapi-events-migration.h" 49 #include "qapi/qmp/qerror.h" 50 #include "trace.h" 51 #include "exec/ram_addr.h" 52 #include "exec/target_page.h" 53 #include "qemu/rcu_queue.h" 54 #include "migration/colo.h" 55 #include "block.h" 56 #include "sysemu/cpu-throttle.h" 57 #include "savevm.h" 58 #include "qemu/iov.h" 59 #include "multifd.h" 60 #include "sysemu/runstate.h" 61 #include "options.h" 62 63 #include "hw/boards.h" /* for machine_dump_guest_core() */ 64 65 #if defined(__linux__) 66 #include "qemu/userfaultfd.h" 67 #endif /* defined(__linux__) */ 68 69 /***********************************************************/ 70 /* ram save/restore */ 71 72 /* 73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 74 * worked for pages that were filled with the same char. We switched 75 * it to only search for the zero value. And to avoid confusion with 76 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
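 *
 * All of these flags travel in the low bits of the 64-bit page offset
 * written by save_page_header(), e.g. (offset | RAM_SAVE_FLAG_ZERO);
 * that works because offsets are always target-page aligned, so the
 * low bits are otherwise unused.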
77 */ 78 /* 79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 80 */ 81 #define RAM_SAVE_FLAG_FULL 0x01 82 #define RAM_SAVE_FLAG_ZERO 0x02 83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 84 #define RAM_SAVE_FLAG_PAGE 0x08 85 #define RAM_SAVE_FLAG_EOS 0x10 86 #define RAM_SAVE_FLAG_CONTINUE 0x20 87 #define RAM_SAVE_FLAG_XBZRLE 0x40 88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 89 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 91 /* We can't use any flag that is bigger than 0x200 */ 92 93 XBZRLECacheStats xbzrle_counters; 94 95 /* used by the search for pages to send */ 96 struct PageSearchStatus { 97 /* The migration channel used for a specific host page */ 98 QEMUFile *pss_channel; 99 /* Last block from where we have sent data */ 100 RAMBlock *last_sent_block; 101 /* Current block being searched */ 102 RAMBlock *block; 103 /* Current page to search from */ 104 unsigned long page; 105 /* Set once we wrap around */ 106 bool complete_round; 107 /* Whether we're sending a host page */ 108 bool host_page_sending; 109 /* The start/end of current host page. Invalid if host_page_sending==false */ 110 unsigned long host_page_start; 111 unsigned long host_page_end; 112 }; 113 typedef struct PageSearchStatus PageSearchStatus; 114 115 /* struct contains XBZRLE cache and a static page 116 used by the compression */ 117 static struct { 118 /* buffer used for XBZRLE encoding */ 119 uint8_t *encoded_buf; 120 /* buffer for storing page content */ 121 uint8_t *current_buf; 122 /* Cache for XBZRLE, Protected by lock. */ 123 PageCache *cache; 124 QemuMutex lock; 125 /* it will store a page full of zeros */ 126 uint8_t *zero_target_page; 127 /* buffer used for XBZRLE decoding */ 128 uint8_t *decoded_buf; 129 } XBZRLE; 130 131 static void XBZRLE_cache_lock(void) 132 { 133 if (migrate_xbzrle()) { 134 qemu_mutex_lock(&XBZRLE.lock); 135 } 136 } 137 138 static void XBZRLE_cache_unlock(void) 139 { 140 if (migrate_xbzrle()) { 141 qemu_mutex_unlock(&XBZRLE.lock); 142 } 143 } 144 145 /** 146 * xbzrle_cache_resize: resize the xbzrle cache 147 * 148 * This function is called from migrate_params_apply in main 149 * thread, possibly while a migration is in progress. A running 150 * migration may be using the cache and might finish during this call, 151 * hence changes to the cache are protected by XBZRLE.lock(). 
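 *
 * For example, growing the cache while pages are still being encoded
 * simply waits for the lock, swaps in a freshly allocated cache via
 * cache_init() and releases the old one with cache_fini().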
152 * 153 * Returns 0 for success or -1 for error 154 * 155 * @new_size: new cache size 156 * @errp: set *errp if the check failed, with reason 157 */ 158 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 159 { 160 PageCache *new_cache; 161 int64_t ret = 0; 162 163 /* Check for truncation */ 164 if (new_size != (size_t)new_size) { 165 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 166 "exceeding address space"); 167 return -1; 168 } 169 170 if (new_size == migrate_xbzrle_cache_size()) { 171 /* nothing to do */ 172 return 0; 173 } 174 175 XBZRLE_cache_lock(); 176 177 if (XBZRLE.cache != NULL) { 178 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 179 if (!new_cache) { 180 ret = -1; 181 goto out; 182 } 183 184 cache_fini(XBZRLE.cache); 185 XBZRLE.cache = new_cache; 186 } 187 out: 188 XBZRLE_cache_unlock(); 189 return ret; 190 } 191 192 static bool postcopy_preempt_active(void) 193 { 194 return migrate_postcopy_preempt() && migration_in_postcopy(); 195 } 196 197 bool ramblock_is_ignored(RAMBlock *block) 198 { 199 return !qemu_ram_is_migratable(block) || 200 (migrate_ignore_shared() && qemu_ram_is_shared(block) 201 && qemu_ram_is_named_file(block)); 202 } 203 204 #undef RAMBLOCK_FOREACH 205 206 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 207 { 208 RAMBlock *block; 209 int ret = 0; 210 211 RCU_READ_LOCK_GUARD(); 212 213 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 214 ret = func(block, opaque); 215 if (ret) { 216 break; 217 } 218 } 219 return ret; 220 } 221 222 static void ramblock_recv_map_init(void) 223 { 224 RAMBlock *rb; 225 226 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 227 assert(!rb->receivedmap); 228 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 229 } 230 } 231 232 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 233 { 234 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 235 rb->receivedmap); 236 } 237 238 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 239 { 240 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 241 } 242 243 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 244 { 245 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 246 } 247 248 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 249 size_t nr) 250 { 251 bitmap_set_atomic(rb->receivedmap, 252 ramblock_recv_bitmap_offset(host_addr, rb), 253 nr); 254 } 255 256 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 257 258 /* 259 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 260 * 261 * Returns >0 if success with sent bytes, or <0 if error. 262 */ 263 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 264 const char *block_name) 265 { 266 RAMBlock *block = qemu_ram_block_by_name(block_name); 267 unsigned long *le_bitmap, nbits; 268 uint64_t size; 269 270 if (!block) { 271 error_report("%s: invalid block name: %s", __func__, block_name); 272 return -1; 273 } 274 275 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 276 277 /* 278 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 279 * machines we may need 4 more bytes for padding (see below 280 * comment). So extend it a bit before hand. 281 */ 282 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 283 284 /* 285 * Always use little endian when sending the bitmap. This is 286 * required that when source and destination VMs are not using the 287 * same endianness. (Note: big endian won't work.) 
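     *
     * The layout written below is therefore:
     *   be64 size | 'size' bytes of little-endian bitmap | be64 end marker
     * with 'size' rounded up to 8 bytes so that 32-bit and 64-bit peers
     * agree on the padding.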
288 */ 289 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 290 291 /* Size of the bitmap, in bytes */ 292 size = DIV_ROUND_UP(nbits, 8); 293 294 /* 295 * size is always aligned to 8 bytes for 64bit machines, but it 296 * may not be true for 32bit machines. We need this padding to 297 * make sure the migration can survive even between 32bit and 298 * 64bit machines. 299 */ 300 size = ROUND_UP(size, 8); 301 302 qemu_put_be64(file, size); 303 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 304 /* 305 * Mark as an end, in case the middle part is screwed up due to 306 * some "mysterious" reason. 307 */ 308 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 309 qemu_fflush(file); 310 311 g_free(le_bitmap); 312 313 if (qemu_file_get_error(file)) { 314 return qemu_file_get_error(file); 315 } 316 317 return size + sizeof(size); 318 } 319 320 /* 321 * An outstanding page request, on the source, having been received 322 * and queued 323 */ 324 struct RAMSrcPageRequest { 325 RAMBlock *rb; 326 hwaddr offset; 327 hwaddr len; 328 329 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 330 }; 331 332 /* State of RAM for migration */ 333 struct RAMState { 334 /* 335 * PageSearchStatus structures for the channels when send pages. 336 * Protected by the bitmap_mutex. 337 */ 338 PageSearchStatus pss[RAM_CHANNEL_MAX]; 339 /* UFFD file descriptor, used in 'write-tracking' migration */ 340 int uffdio_fd; 341 /* total ram size in bytes */ 342 uint64_t ram_bytes_total; 343 /* Last block that we have visited searching for dirty pages */ 344 RAMBlock *last_seen_block; 345 /* Last dirty target page we have sent */ 346 ram_addr_t last_page; 347 /* last ram version we have seen */ 348 uint32_t last_version; 349 /* How many times we have dirty too many pages */ 350 int dirty_rate_high_cnt; 351 /* these variables are used for bitmap sync */ 352 /* last time we did a full bitmap_sync */ 353 int64_t time_last_bitmap_sync; 354 /* bytes transferred at start_time */ 355 uint64_t bytes_xfer_prev; 356 /* number of dirty pages since start_time */ 357 uint64_t num_dirty_pages_period; 358 /* xbzrle misses since the beginning of the period */ 359 uint64_t xbzrle_cache_miss_prev; 360 /* Amount of xbzrle pages since the beginning of the period */ 361 uint64_t xbzrle_pages_prev; 362 /* Amount of xbzrle encoded bytes since the beginning of the period */ 363 uint64_t xbzrle_bytes_prev; 364 /* Are we really using XBZRLE (e.g., after the first round). 
*/ 365 bool xbzrle_started; 366 /* Are we on the last stage of migration */ 367 bool last_stage; 368 /* compression statistics since the beginning of the period */ 369 /* amount of count that no free thread to compress data */ 370 uint64_t compress_thread_busy_prev; 371 /* amount bytes after compression */ 372 uint64_t compressed_size_prev; 373 /* amount of compressed pages */ 374 uint64_t compress_pages_prev; 375 376 /* total handled target pages at the beginning of period */ 377 uint64_t target_page_count_prev; 378 /* total handled target pages since start */ 379 uint64_t target_page_count; 380 /* number of dirty bits in the bitmap */ 381 uint64_t migration_dirty_pages; 382 /* 383 * Protects: 384 * - dirty/clear bitmap 385 * - migration_dirty_pages 386 * - pss structures 387 */ 388 QemuMutex bitmap_mutex; 389 /* The RAMBlock used in the last src_page_requests */ 390 RAMBlock *last_req_rb; 391 /* Queue of outstanding page requests from the destination */ 392 QemuMutex src_page_req_mutex; 393 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 394 }; 395 typedef struct RAMState RAMState; 396 397 static RAMState *ram_state; 398 399 static NotifierWithReturnList precopy_notifier_list; 400 401 /* Whether postcopy has queued requests? */ 402 static bool postcopy_has_request(RAMState *rs) 403 { 404 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 405 } 406 407 void precopy_infrastructure_init(void) 408 { 409 notifier_with_return_list_init(&precopy_notifier_list); 410 } 411 412 void precopy_add_notifier(NotifierWithReturn *n) 413 { 414 notifier_with_return_list_add(&precopy_notifier_list, n); 415 } 416 417 void precopy_remove_notifier(NotifierWithReturn *n) 418 { 419 notifier_with_return_remove(n); 420 } 421 422 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 423 { 424 PrecopyNotifyData pnd; 425 pnd.reason = reason; 426 pnd.errp = errp; 427 428 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 429 } 430 431 uint64_t ram_bytes_remaining(void) 432 { 433 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 434 0; 435 } 436 437 void ram_transferred_add(uint64_t bytes) 438 { 439 if (runstate_is_running()) { 440 stat64_add(&mig_stats.precopy_bytes, bytes); 441 } else if (migration_in_postcopy()) { 442 stat64_add(&mig_stats.postcopy_bytes, bytes); 443 } else { 444 stat64_add(&mig_stats.downtime_bytes, bytes); 445 } 446 stat64_add(&mig_stats.transferred, bytes); 447 } 448 449 struct MigrationOps { 450 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 451 }; 452 typedef struct MigrationOps MigrationOps; 453 454 MigrationOps *migration_ops; 455 456 static int ram_save_host_page_urgent(PageSearchStatus *pss); 457 458 /* NOTE: page is the PFN not real ram_addr_t. */ 459 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 460 { 461 pss->block = rb; 462 pss->page = page; 463 pss->complete_round = false; 464 } 465 466 /* 467 * Check whether two PSSs are actively sending the same page. Return true 468 * if it is, false otherwise. 
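 *
 * This matters when more than one channel can send pages concurrently,
 * e.g. the precopy channel and the postcopy-preempt channel touching the
 * same host page.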
469 */ 470 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 471 { 472 return pss1->host_page_sending && pss2->host_page_sending && 473 (pss1->host_page_start == pss2->host_page_start); 474 } 475 476 /** 477 * save_page_header: write page header to wire 478 * 479 * If this is the 1st block, it also writes the block identification 480 * 481 * Returns the number of bytes written 482 * 483 * @pss: current PSS channel status 484 * @block: block that contains the page we want to send 485 * @offset: offset inside the block for the page 486 * in the lower bits, it contains flags 487 */ 488 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 489 RAMBlock *block, ram_addr_t offset) 490 { 491 size_t size, len; 492 bool same_block = (block == pss->last_sent_block); 493 494 if (same_block) { 495 offset |= RAM_SAVE_FLAG_CONTINUE; 496 } 497 qemu_put_be64(f, offset); 498 size = 8; 499 500 if (!same_block) { 501 len = strlen(block->idstr); 502 qemu_put_byte(f, len); 503 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 504 size += 1 + len; 505 pss->last_sent_block = block; 506 } 507 return size; 508 } 509 510 /** 511 * mig_throttle_guest_down: throttle down the guest 512 * 513 * Reduce amount of guest cpu execution to hopefully slow down memory 514 * writes. If guest dirty memory rate is reduced below the rate at 515 * which we can transfer pages to the destination then we should be 516 * able to complete migration. Some workloads dirty memory way too 517 * fast and will not effectively converge, even with auto-converge. 518 */ 519 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 520 uint64_t bytes_dirty_threshold) 521 { 522 uint64_t pct_initial = migrate_cpu_throttle_initial(); 523 uint64_t pct_increment = migrate_cpu_throttle_increment(); 524 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 525 int pct_max = migrate_max_cpu_throttle(); 526 527 uint64_t throttle_now = cpu_throttle_get_percentage(); 528 uint64_t cpu_now, cpu_ideal, throttle_inc; 529 530 /* We have not started throttling yet. Let's start it. */ 531 if (!cpu_throttle_active()) { 532 cpu_throttle_set(pct_initial); 533 } else { 534 /* Throttling already on, just increase the rate */ 535 if (!pct_tailslow) { 536 throttle_inc = pct_increment; 537 } else { 538 /* Compute the ideal CPU percentage used by Guest, which may 539 * make the dirty rate match the dirty rate threshold. */ 540 cpu_now = 100 - throttle_now; 541 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 542 bytes_dirty_period); 543 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 544 } 545 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 546 } 547 } 548 549 void mig_throttle_counter_reset(void) 550 { 551 RAMState *rs = ram_state; 552 553 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 554 rs->num_dirty_pages_period = 0; 555 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred); 556 } 557 558 /** 559 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 560 * 561 * @rs: current RAM state 562 * @current_addr: address for the zero page 563 * 564 * Update the xbzrle cache to reflect a page that's been sent as all 0. 565 * The important thing is that a stale (not-yet-0'd) page be replaced 566 * by the new data. 567 * As a bonus, if the page wasn't in the cache it gets added so that 568 * when a small write is made into the 0'd page it gets XBZRLE sent. 
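 *
 * Example: a page is sent as a zero page in one sync round and the guest
 * then dirties a few bytes of it; because the cached copy was replaced
 * with zeros here, the next round can send just the small XBZRLE delta
 * instead of a full page.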
569 */ 570 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 571 { 572 /* We don't care if this fails to allocate a new cache page 573 * as long as it updated an old one */ 574 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 575 stat64_get(&mig_stats.dirty_sync_count)); 576 } 577 578 #define ENCODING_FLAG_XBZRLE 0x1 579 580 /** 581 * save_xbzrle_page: compress and send current page 582 * 583 * Returns: 1 means that we wrote the page 584 * 0 means that page is identical to the one already sent 585 * -1 means that xbzrle would be longer than normal 586 * 587 * @rs: current RAM state 588 * @pss: current PSS channel 589 * @current_data: pointer to the address of the page contents 590 * @current_addr: addr of the page 591 * @block: block that contains the page we want to send 592 * @offset: offset inside the block for the page 593 */ 594 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 595 uint8_t **current_data, ram_addr_t current_addr, 596 RAMBlock *block, ram_addr_t offset) 597 { 598 int encoded_len = 0, bytes_xbzrle; 599 uint8_t *prev_cached_page; 600 QEMUFile *file = pss->pss_channel; 601 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 602 603 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 604 xbzrle_counters.cache_miss++; 605 if (!rs->last_stage) { 606 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 607 generation) == -1) { 608 return -1; 609 } else { 610 /* update *current_data when the page has been 611 inserted into cache */ 612 *current_data = get_cached_data(XBZRLE.cache, current_addr); 613 } 614 } 615 return -1; 616 } 617 618 /* 619 * Reaching here means the page has hit the xbzrle cache, no matter what 620 * encoding result it is (normal encoding, overflow or skipping the page), 621 * count the page as encoded. This is used to calculate the encoding rate. 622 * 623 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 624 * 2nd page turns out to be skipped (i.e. no new bytes written to the 625 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 626 * skipped page included. In this way, the encoding rate can tell if the 627 * guest page is good for xbzrle encoding. 628 */ 629 xbzrle_counters.pages++; 630 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 631 632 /* save current buffer into memory */ 633 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 634 635 /* XBZRLE encoding (if there is no overflow) */ 636 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 637 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 638 TARGET_PAGE_SIZE); 639 640 /* 641 * Update the cache contents, so that it corresponds to the data 642 * sent, in all cases except where we skip the page. 643 */ 644 if (!rs->last_stage && encoded_len != 0) { 645 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 646 /* 647 * In the case where we couldn't compress, ensure that the caller 648 * sends the data from the cache, since the guest might have 649 * changed the RAM since we copied it. 
650 */ 651 *current_data = prev_cached_page; 652 } 653 654 if (encoded_len == 0) { 655 trace_save_xbzrle_page_skipping(); 656 return 0; 657 } else if (encoded_len == -1) { 658 trace_save_xbzrle_page_overflow(); 659 xbzrle_counters.overflow++; 660 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 661 return -1; 662 } 663 664 /* Send XBZRLE based compressed page */ 665 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 666 offset | RAM_SAVE_FLAG_XBZRLE); 667 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 668 qemu_put_be16(file, encoded_len); 669 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 670 bytes_xbzrle += encoded_len + 1 + 2; 671 /* 672 * Like compressed_size (please see update_compress_thread_counts), 673 * the xbzrle encoded bytes don't count the 8 byte header with 674 * RAM_SAVE_FLAG_CONTINUE. 675 */ 676 xbzrle_counters.bytes += bytes_xbzrle - 8; 677 ram_transferred_add(bytes_xbzrle); 678 679 return 1; 680 } 681 682 /** 683 * pss_find_next_dirty: find the next dirty page of current ramblock 684 * 685 * This function updates pss->page to point to the next dirty page index 686 * within the ramblock to migrate, or the end of ramblock when nothing 687 * found. Note that when pss->host_page_sending==true it means we're 688 * during sending a host page, so we won't look for dirty page that is 689 * outside the host page boundary. 690 * 691 * @pss: the current page search status 692 */ 693 static void pss_find_next_dirty(PageSearchStatus *pss) 694 { 695 RAMBlock *rb = pss->block; 696 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 697 unsigned long *bitmap = rb->bmap; 698 699 if (ramblock_is_ignored(rb)) { 700 /* Points directly to the end, so we know no dirty page */ 701 pss->page = size; 702 return; 703 } 704 705 /* 706 * If during sending a host page, only look for dirty pages within the 707 * current host page being send. 708 */ 709 if (pss->host_page_sending) { 710 assert(pss->host_page_end); 711 size = MIN(size, pss->host_page_end); 712 } 713 714 pss->page = find_next_bit(bitmap, size, pss->page); 715 } 716 717 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 718 unsigned long page) 719 { 720 uint8_t shift; 721 hwaddr size, start; 722 723 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 724 return; 725 } 726 727 shift = rb->clear_bmap_shift; 728 /* 729 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 730 * can make things easier sometimes since then start address 731 * of the small chunk will always be 64 pages aligned so the 732 * bitmap will always be aligned to unsigned long. We should 733 * even be able to remove this restriction but I'm simply 734 * keeping it. 735 */ 736 assert(shift >= 6); 737 738 size = 1ULL << (TARGET_PAGE_BITS + shift); 739 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 740 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 741 memory_region_clear_dirty_bitmap(rb->mr, start, size); 742 } 743 744 static void 745 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 746 unsigned long start, 747 unsigned long npages) 748 { 749 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 750 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 751 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 752 753 /* 754 * Clear pages from start to start + npages - 1, so the end boundary is 755 * exclusive. 
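     *
     * For example, with clear_bmap_shift == 6 (64-page chunks), clearing
     * pages 100..120 walks the single chunk covering pages 64..127:
     * chunk_start = ALIGN_DOWN(100, 64) = 64 and
     * chunk_end = ALIGN_UP(121, 64) = 128.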
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any page in the chunk because we need to make sure we can
     * capture further page content changes when we sync the dirty log
     * the next time. So as long as we are going to send any page in
     * the chunk, we clear the remote dirty bitmap for the whole chunk.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
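 *
 * For example, a VM backed by 4 KiB pages plus 2 MiB hugepages yields
 * 0x1000 | 0x200000 = 0x201000.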
914 */ 915 uint64_t ram_pagesize_summary(void) 916 { 917 RAMBlock *block; 918 uint64_t summary = 0; 919 920 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 921 summary |= block->page_size; 922 } 923 924 return summary; 925 } 926 927 uint64_t ram_get_total_transferred_pages(void) 928 { 929 return stat64_get(&mig_stats.normal_pages) + 930 stat64_get(&mig_stats.zero_pages) + 931 compression_counters.pages + xbzrle_counters.pages; 932 } 933 934 static void migration_update_rates(RAMState *rs, int64_t end_time) 935 { 936 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 937 double compressed_size; 938 939 /* calculate period counters */ 940 stat64_set(&mig_stats.dirty_pages_rate, 941 rs->num_dirty_pages_period * 1000 / 942 (end_time - rs->time_last_bitmap_sync)); 943 944 if (!page_count) { 945 return; 946 } 947 948 if (migrate_xbzrle()) { 949 double encoded_size, unencoded_size; 950 951 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 952 rs->xbzrle_cache_miss_prev) / page_count; 953 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 954 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 955 TARGET_PAGE_SIZE; 956 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 957 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 958 xbzrle_counters.encoding_rate = 0; 959 } else { 960 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 961 } 962 rs->xbzrle_pages_prev = xbzrle_counters.pages; 963 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 964 } 965 966 if (migrate_compress()) { 967 compression_counters.busy_rate = (double)(compression_counters.busy - 968 rs->compress_thread_busy_prev) / page_count; 969 rs->compress_thread_busy_prev = compression_counters.busy; 970 971 compressed_size = compression_counters.compressed_size - 972 rs->compressed_size_prev; 973 if (compressed_size) { 974 double uncompressed_size = (compression_counters.pages - 975 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 976 977 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 978 compression_counters.compression_rate = 979 uncompressed_size / compressed_size; 980 981 rs->compress_pages_prev = compression_counters.pages; 982 rs->compressed_size_prev = compression_counters.compressed_size; 983 } 984 } 985 } 986 987 static void migration_trigger_throttle(RAMState *rs) 988 { 989 uint64_t threshold = migrate_throttle_trigger_threshold(); 990 uint64_t bytes_xfer_period = 991 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev; 992 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 993 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 994 995 /* During block migration the auto-converge logic incorrectly detects 996 * that ram migration makes no progress. Avoid this by disabling the 997 * throttling logic during the bulk phase of block migration. */ 998 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 999 /* The following detection logic can be refined later. For now: 1000 Check to see if the ratio between dirtied bytes and the approx. 1001 amount of bytes that just got transferred since the last time 1002 we were in this routine reaches the threshold. If that happens 1003 twice, start or increase throttling. 
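
           Example: with a trigger threshold of 50%, transferring 1 GiB in a
           period while the guest dirtied more than 512 MiB in that same
           period counts as one "too dirty" period; two such periods in a
           row start (or increase) the throttle.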
         */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization for migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of the data written to the file, or 0 if the page is
 * not a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
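 * (1 when a zero page was detected and sent, or -1 when the page was not
 * zero and nothing was written)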
1120 * 1121 * @pss: current PSS channel 1122 * @block: block that contains the page we want to send 1123 * @offset: offset inside the block for the page 1124 */ 1125 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, 1126 ram_addr_t offset) 1127 { 1128 int len = save_zero_page_to_file(pss, f, block, offset); 1129 1130 if (len) { 1131 stat64_add(&mig_stats.zero_pages, 1); 1132 ram_transferred_add(len); 1133 return 1; 1134 } 1135 return -1; 1136 } 1137 1138 /* 1139 * @pages: the number of pages written by the control path, 1140 * < 0 - error 1141 * > 0 - number of pages written 1142 * 1143 * Return true if the pages has been saved, otherwise false is returned. 1144 */ 1145 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, 1146 ram_addr_t offset, int *pages) 1147 { 1148 uint64_t bytes_xmit = 0; 1149 int ret; 1150 1151 *pages = -1; 1152 ret = ram_control_save_page(pss->pss_channel, block->offset, offset, 1153 TARGET_PAGE_SIZE, &bytes_xmit); 1154 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1155 return false; 1156 } 1157 1158 if (bytes_xmit) { 1159 ram_transferred_add(bytes_xmit); 1160 *pages = 1; 1161 } 1162 1163 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1164 return true; 1165 } 1166 1167 if (bytes_xmit > 0) { 1168 stat64_add(&mig_stats.normal_pages, 1); 1169 } else if (bytes_xmit == 0) { 1170 stat64_add(&mig_stats.zero_pages, 1); 1171 } 1172 1173 return true; 1174 } 1175 1176 /* 1177 * directly send the page to the stream 1178 * 1179 * Returns the number of pages written. 1180 * 1181 * @pss: current PSS channel 1182 * @block: block that contains the page we want to send 1183 * @offset: offset inside the block for the page 1184 * @buf: the page to be sent 1185 * @async: send to page asyncly 1186 */ 1187 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, 1188 ram_addr_t offset, uint8_t *buf, bool async) 1189 { 1190 QEMUFile *file = pss->pss_channel; 1191 1192 ram_transferred_add(save_page_header(pss, pss->pss_channel, block, 1193 offset | RAM_SAVE_FLAG_PAGE)); 1194 if (async) { 1195 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, 1196 migrate_release_ram() && 1197 migration_in_postcopy()); 1198 } else { 1199 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); 1200 } 1201 ram_transferred_add(TARGET_PAGE_SIZE); 1202 stat64_add(&mig_stats.normal_pages, 1); 1203 return 1; 1204 } 1205 1206 /** 1207 * ram_save_page: send the given page to the stream 1208 * 1209 * Returns the number of pages written. 1210 * < 0 - error 1211 * >=0 - Number of pages written - this might legally be 0 1212 * if xbzrle noticed the page was the same. 
1213 * 1214 * @rs: current RAM state 1215 * @block: block that contains the page we want to send 1216 * @offset: offset inside the block for the page 1217 */ 1218 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1219 { 1220 int pages = -1; 1221 uint8_t *p; 1222 bool send_async = true; 1223 RAMBlock *block = pss->block; 1224 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1225 ram_addr_t current_addr = block->offset + offset; 1226 1227 p = block->host + offset; 1228 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1229 1230 XBZRLE_cache_lock(); 1231 if (rs->xbzrle_started && !migration_in_postcopy()) { 1232 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1233 block, offset); 1234 if (!rs->last_stage) { 1235 /* Can't send this cached data async, since the cache page 1236 * might get updated before it gets to the wire 1237 */ 1238 send_async = false; 1239 } 1240 } 1241 1242 /* XBZRLE overflow or normal page */ 1243 if (pages == -1) { 1244 pages = save_normal_page(pss, block, offset, p, send_async); 1245 } 1246 1247 XBZRLE_cache_unlock(); 1248 1249 return pages; 1250 } 1251 1252 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, 1253 ram_addr_t offset) 1254 { 1255 if (multifd_queue_page(file, block, offset) < 0) { 1256 return -1; 1257 } 1258 stat64_add(&mig_stats.normal_pages, 1); 1259 1260 return 1; 1261 } 1262 1263 static void 1264 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1265 { 1266 ram_transferred_add(bytes_xmit); 1267 1268 if (param->result == RES_ZEROPAGE) { 1269 stat64_add(&mig_stats.zero_pages, 1); 1270 return; 1271 } 1272 1273 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1274 compression_counters.compressed_size += bytes_xmit - 8; 1275 compression_counters.pages++; 1276 } 1277 1278 static bool save_page_use_compression(RAMState *rs); 1279 1280 static int send_queued_data(CompressParam *param) 1281 { 1282 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY]; 1283 MigrationState *ms = migrate_get_current(); 1284 QEMUFile *file = ms->to_dst_file; 1285 int len = 0; 1286 1287 RAMBlock *block = param->block; 1288 ram_addr_t offset = param->offset; 1289 1290 if (param->result == RES_NONE) { 1291 return 0; 1292 } 1293 1294 assert(block == pss->last_sent_block); 1295 1296 if (param->result == RES_ZEROPAGE) { 1297 assert(qemu_file_buffer_empty(param->file)); 1298 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1299 qemu_put_byte(file, 0); 1300 len += 1; 1301 ram_release_page(block->idstr, offset); 1302 } else if (param->result == RES_COMPRESS) { 1303 assert(!qemu_file_buffer_empty(param->file)); 1304 len += save_page_header(pss, file, block, 1305 offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1306 len += qemu_put_qemu_file(file, param->file); 1307 } else { 1308 abort(); 1309 } 1310 1311 update_compress_thread_counts(param, len); 1312 1313 return len; 1314 } 1315 1316 static void ram_flush_compressed_data(RAMState *rs) 1317 { 1318 if (!save_page_use_compression(rs)) { 1319 return; 1320 } 1321 1322 flush_compressed_data(send_queued_data); 1323 } 1324 1325 #define PAGE_ALL_CLEAN 0 1326 #define PAGE_TRY_AGAIN 1 1327 #define PAGE_DIRTY_FOUND 2 1328 /** 1329 * find_dirty_block: find the next dirty page and update any state 1330 * associated with the search process. 
1331 * 1332 * Returns: 1333 * <0: An error happened 1334 * PAGE_ALL_CLEAN: no dirty page found, give up 1335 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1336 * PAGE_DIRTY_FOUND: dirty page found 1337 * 1338 * @rs: current RAM state 1339 * @pss: data about the state of the current dirty page scan 1340 * @again: set to false if the search has scanned the whole of RAM 1341 */ 1342 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1343 { 1344 /* Update pss->page for the next dirty bit in ramblock */ 1345 pss_find_next_dirty(pss); 1346 1347 if (pss->complete_round && pss->block == rs->last_seen_block && 1348 pss->page >= rs->last_page) { 1349 /* 1350 * We've been once around the RAM and haven't found anything. 1351 * Give up. 1352 */ 1353 return PAGE_ALL_CLEAN; 1354 } 1355 if (!offset_in_ramblock(pss->block, 1356 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1357 /* Didn't find anything in this RAM Block */ 1358 pss->page = 0; 1359 pss->block = QLIST_NEXT_RCU(pss->block, next); 1360 if (!pss->block) { 1361 if (!migrate_multifd_flush_after_each_section()) { 1362 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; 1363 int ret = multifd_send_sync_main(f); 1364 if (ret < 0) { 1365 return ret; 1366 } 1367 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 1368 qemu_fflush(f); 1369 } 1370 /* 1371 * If memory migration starts over, we will meet a dirtied page 1372 * which may still exists in compression threads's ring, so we 1373 * should flush the compressed data to make sure the new page 1374 * is not overwritten by the old one in the destination. 1375 * 1376 * Also If xbzrle is on, stop using the data compression at this 1377 * point. In theory, xbzrle can do better than compression. 1378 */ 1379 ram_flush_compressed_data(rs); 1380 1381 /* Hit the end of the list */ 1382 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1383 /* Flag that we've looped */ 1384 pss->complete_round = true; 1385 /* After the first round, enable XBZRLE. */ 1386 if (migrate_xbzrle()) { 1387 rs->xbzrle_started = true; 1388 } 1389 } 1390 /* Didn't find anything this time, but try again on the new block */ 1391 return PAGE_TRY_AGAIN; 1392 } else { 1393 /* We've found something */ 1394 return PAGE_DIRTY_FOUND; 1395 } 1396 } 1397 1398 /** 1399 * unqueue_page: gets a page of the queue 1400 * 1401 * Helper for 'get_queued_page' - gets a page off the queue 1402 * 1403 * Returns the block of the page (or NULL if none available) 1404 * 1405 * @rs: current RAM state 1406 * @offset: used to return the offset within the RAMBlock 1407 */ 1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1409 { 1410 struct RAMSrcPageRequest *entry; 1411 RAMBlock *block = NULL; 1412 1413 if (!postcopy_has_request(rs)) { 1414 return NULL; 1415 } 1416 1417 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1418 1419 /* 1420 * This should _never_ change even after we take the lock, because no one 1421 * should be taking anything off the request list other than us. 
1422 */ 1423 assert(postcopy_has_request(rs)); 1424 1425 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1426 block = entry->rb; 1427 *offset = entry->offset; 1428 1429 if (entry->len > TARGET_PAGE_SIZE) { 1430 entry->len -= TARGET_PAGE_SIZE; 1431 entry->offset += TARGET_PAGE_SIZE; 1432 } else { 1433 memory_region_unref(block->mr); 1434 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1435 g_free(entry); 1436 migration_consume_urgent_request(); 1437 } 1438 1439 return block; 1440 } 1441 1442 #if defined(__linux__) 1443 /** 1444 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1445 * is found, return RAM block pointer and page offset 1446 * 1447 * Returns pointer to the RAMBlock containing faulting page, 1448 * NULL if no write faults are pending 1449 * 1450 * @rs: current RAM state 1451 * @offset: page offset from the beginning of the block 1452 */ 1453 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1454 { 1455 struct uffd_msg uffd_msg; 1456 void *page_address; 1457 RAMBlock *block; 1458 int res; 1459 1460 if (!migrate_background_snapshot()) { 1461 return NULL; 1462 } 1463 1464 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1465 if (res <= 0) { 1466 return NULL; 1467 } 1468 1469 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1470 block = qemu_ram_block_from_host(page_address, false, offset); 1471 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1472 return block; 1473 } 1474 1475 /** 1476 * ram_save_release_protection: release UFFD write protection after 1477 * a range of pages has been saved 1478 * 1479 * @rs: current RAM state 1480 * @pss: page-search-status structure 1481 * @start_page: index of the first page in the range relative to pss->block 1482 * 1483 * Returns 0 on success, negative value in case of an error 1484 */ 1485 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1486 unsigned long start_page) 1487 { 1488 int res = 0; 1489 1490 /* Check if page is from UFFD-managed region. */ 1491 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1492 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1493 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1494 1495 /* Flush async buffers before un-protect. */ 1496 qemu_fflush(pss->pss_channel); 1497 /* Un-protect memory range. 
*/ 1498 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1499 false, false); 1500 } 1501 1502 return res; 1503 } 1504 1505 /* ram_write_tracking_available: check if kernel supports required UFFD features 1506 * 1507 * Returns true if supports, false otherwise 1508 */ 1509 bool ram_write_tracking_available(void) 1510 { 1511 uint64_t uffd_features; 1512 int res; 1513 1514 res = uffd_query_features(&uffd_features); 1515 return (res == 0 && 1516 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1517 } 1518 1519 /* ram_write_tracking_compatible: check if guest configuration is 1520 * compatible with 'write-tracking' 1521 * 1522 * Returns true if compatible, false otherwise 1523 */ 1524 bool ram_write_tracking_compatible(void) 1525 { 1526 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1527 int uffd_fd; 1528 RAMBlock *block; 1529 bool ret = false; 1530 1531 /* Open UFFD file descriptor */ 1532 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1533 if (uffd_fd < 0) { 1534 return false; 1535 } 1536 1537 RCU_READ_LOCK_GUARD(); 1538 1539 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1540 uint64_t uffd_ioctls; 1541 1542 /* Nothing to do with read-only and MMIO-writable regions */ 1543 if (block->mr->readonly || block->mr->rom_device) { 1544 continue; 1545 } 1546 /* Try to register block memory via UFFD-IO to track writes */ 1547 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1548 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1549 goto out; 1550 } 1551 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1552 goto out; 1553 } 1554 } 1555 ret = true; 1556 1557 out: 1558 uffd_close_fd(uffd_fd); 1559 return ret; 1560 } 1561 1562 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1563 ram_addr_t size) 1564 { 1565 const ram_addr_t end = offset + size; 1566 1567 /* 1568 * We read one byte of each page; this will preallocate page tables if 1569 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1570 * where no page was populated yet. This might require adaption when 1571 * supporting other mappings, like shmem. 1572 */ 1573 for (; offset < end; offset += block->page_size) { 1574 char tmp = *((char *)block->host + offset); 1575 1576 /* Don't optimize the read out */ 1577 asm volatile("" : "+r" (tmp)); 1578 } 1579 } 1580 1581 static inline int populate_read_section(MemoryRegionSection *section, 1582 void *opaque) 1583 { 1584 const hwaddr size = int128_get64(section->size); 1585 hwaddr offset = section->offset_within_region; 1586 RAMBlock *block = section->mr->ram_block; 1587 1588 populate_read_range(block, offset, size); 1589 return 0; 1590 } 1591 1592 /* 1593 * ram_block_populate_read: preallocate page tables and populate pages in the 1594 * RAM block by reading a byte of each page. 1595 * 1596 * Since it's solely used for userfault_fd WP feature, here we just 1597 * hardcode page size to qemu_real_host_page_size. 1598 * 1599 * @block: RAM block to populate 1600 */ 1601 static void ram_block_populate_read(RAMBlock *rb) 1602 { 1603 /* 1604 * Skip populating all pages that fall into a discarded range as managed by 1605 * a RamDiscardManager responsible for the mapped memory region of the 1606 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1607 * must not get populated automatically. 
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in the page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1724 if (ram_block_uffd_protect(block, uffd_fd)) { 1725 goto fail; 1726 } 1727 1728 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1729 block->host, block->max_length); 1730 } 1731 1732 return 0; 1733 1734 fail: 1735 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1736 1737 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1738 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1739 continue; 1740 } 1741 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1742 /* Cleanup flags and remove reference */ 1743 block->flags &= ~RAM_UF_WRITEPROTECT; 1744 memory_region_unref(block->mr); 1745 } 1746 1747 uffd_close_fd(uffd_fd); 1748 rs->uffdio_fd = -1; 1749 return -1; 1750 } 1751 1752 /** 1753 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1754 */ 1755 void ram_write_tracking_stop(void) 1756 { 1757 RAMState *rs = ram_state; 1758 RAMBlock *block; 1759 1760 RCU_READ_LOCK_GUARD(); 1761 1762 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1763 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1764 continue; 1765 } 1766 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1767 1768 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1769 block->host, block->max_length); 1770 1771 /* Cleanup flags and remove reference */ 1772 block->flags &= ~RAM_UF_WRITEPROTECT; 1773 memory_region_unref(block->mr); 1774 } 1775 1776 /* Finally close UFFD file descriptor */ 1777 uffd_close_fd(rs->uffdio_fd); 1778 rs->uffdio_fd = -1; 1779 } 1780 1781 #else 1782 /* No target OS support, stubs just fail or ignore */ 1783 1784 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1785 { 1786 (void) rs; 1787 (void) offset; 1788 1789 return NULL; 1790 } 1791 1792 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1793 unsigned long start_page) 1794 { 1795 (void) rs; 1796 (void) pss; 1797 (void) start_page; 1798 1799 return 0; 1800 } 1801 1802 bool ram_write_tracking_available(void) 1803 { 1804 return false; 1805 } 1806 1807 bool ram_write_tracking_compatible(void) 1808 { 1809 assert(0); 1810 return false; 1811 } 1812 1813 int ram_write_tracking_start(void) 1814 { 1815 assert(0); 1816 return -1; 1817 } 1818 1819 void ram_write_tracking_stop(void) 1820 { 1821 assert(0); 1822 } 1823 #endif /* defined(__linux__) */ 1824 1825 /** 1826 * get_queued_page: unqueue a page from the postcopy requests 1827 * 1828 * Skips pages that are already sent (!dirty) 1829 * 1830 * Returns true if a queued page is found 1831 * 1832 * @rs: current RAM state 1833 * @pss: data about the state of the current dirty page scan 1834 */ 1835 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1836 { 1837 RAMBlock *block; 1838 ram_addr_t offset; 1839 bool dirty; 1840 1841 do { 1842 block = unqueue_page(rs, &offset); 1843 /* 1844 * We're sending this page, and since it's postcopy nothing else 1845 * will dirty it, and we must make sure it doesn't get sent again 1846 * even if this queue request was received after the background 1847 * search already sent it. 
1848 */ 1849 if (block) { 1850 unsigned long page; 1851 1852 page = offset >> TARGET_PAGE_BITS; 1853 dirty = test_bit(page, block->bmap); 1854 if (!dirty) { 1855 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1856 page); 1857 } else { 1858 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1859 } 1860 } 1861 1862 } while (block && !dirty); 1863 1864 if (!block) { 1865 /* 1866 * Poll write faults too if background snapshot is enabled; that's 1867 * when vCPUs may get blocked by the write-protected pages. 1868 */ 1869 block = poll_fault_page(rs, &offset); 1870 } 1871 1872 if (block) { 1873 /* 1874 * We want the background search to continue from the queued page 1875 * since the guest is likely to want other pages near to the page 1876 * it just requested. 1877 */ 1878 pss->block = block; 1879 pss->page = offset >> TARGET_PAGE_BITS; 1880 1881 /* 1882 * This unqueued page would break the "one round" check, even if it 1883 * is really rare. 1884 */ 1885 pss->complete_round = false; 1886 } 1887 1888 return !!block; 1889 } 1890 1891 /** 1892 * migration_page_queue_free: drop any remaining pages in the ram 1893 * request queue 1894 * 1895 * It should be empty at the end anyway, but in error cases there may 1896 * be some left. If any page is left, we drop it. 1897 * 1898 */ 1899 static void migration_page_queue_free(RAMState *rs) 1900 { 1901 struct RAMSrcPageRequest *mspr, *next_mspr; 1902 /* This queue generally should be empty - but in the case of a failed 1903 * migration might have some droppings in. 1904 */ 1905 RCU_READ_LOCK_GUARD(); 1906 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1907 memory_region_unref(mspr->rb->mr); 1908 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1909 g_free(mspr); 1910 } 1911 } 1912 1913 /** 1914 * ram_save_queue_pages: queue the page for transmission 1915 * 1916 * A request from postcopy destination for example. 1917 * 1918 * Returns zero on success or negative on error 1919 * 1920 * @rbname: Name of the RAMBlock of the request. NULL means the 1921 * same as the last one. 1922 * @start: starting address from the start of the RAMBlock 1923 * @len: length (in bytes) to send 1924 */ 1925 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1926 { 1927 RAMBlock *ramblock; 1928 RAMState *rs = ram_state; 1929 1930 stat64_add(&mig_stats.postcopy_requests, 1); 1931 RCU_READ_LOCK_GUARD(); 1932 1933 if (!rbname) { 1934 /* Reuse last RAMBlock */ 1935 ramblock = rs->last_req_rb; 1936 1937 if (!ramblock) { 1938 /* 1939 * Shouldn't happen, we can't reuse the last RAMBlock if 1940 * it's the 1st request. 1941 */ 1942 error_report("ram_save_queue_pages no previous block"); 1943 return -1; 1944 } 1945 } else { 1946 ramblock = qemu_ram_block_by_name(rbname); 1947 1948 if (!ramblock) { 1949 /* We shouldn't be asked for a non-existent RAMBlock */ 1950 error_report("ram_save_queue_pages no block '%s'", rbname); 1951 return -1; 1952 } 1953 rs->last_req_rb = ramblock; 1954 } 1955 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1956 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1957 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1958 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1959 __func__, start, len, ramblock->used_length); 1960 return -1; 1961 } 1962 1963 /* 1964 * When postcopy preempt is enabled, we send back the page directly in the 1965 * rp-return thread.
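* (That is, the urgent page is sent immediately on the dedicated preempt channel below rather than being queued for the migration thread to pick up later.)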
1966 */ 1967 if (postcopy_preempt_active()) { 1968 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1969 size_t page_size = qemu_ram_pagesize(ramblock); 1970 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1971 int ret = 0; 1972 1973 qemu_mutex_lock(&rs->bitmap_mutex); 1974 1975 pss_init(pss, ramblock, page_start); 1976 /* 1977 * Always use the preempt channel, and make sure it's there. It's 1978 * safe to access without lock, because when rp-thread is running 1979 * we should be the only one who operates on the qemufile 1980 */ 1981 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1982 assert(pss->pss_channel); 1983 1984 /* 1985 * It must be one host page or a multiple of the host page size. Just 1986 * assert; if something is wrong we're mostly split-brain anyway. 1987 */ 1988 assert(len % page_size == 0); 1989 while (len) { 1990 if (ram_save_host_page_urgent(pss)) { 1991 error_report("%s: ram_save_host_page_urgent() failed: " 1992 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1993 __func__, ramblock->idstr, start); 1994 ret = -1; 1995 break; 1996 } 1997 /* 1998 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1999 * will automatically be moved and point to the next host page 2000 * we're going to send, so no need to update here. 2001 * 2002 * Normally QEMU never sends >1 host page in requests, so 2003 * logically we don't even need that as the loop should only 2004 * run once, but just to be consistent. 2005 */ 2006 len -= page_size; 2007 }; 2008 qemu_mutex_unlock(&rs->bitmap_mutex); 2009 2010 return ret; 2011 } 2012 2013 struct RAMSrcPageRequest *new_entry = 2014 g_new0(struct RAMSrcPageRequest, 1); 2015 new_entry->rb = ramblock; 2016 new_entry->offset = start; 2017 new_entry->len = len; 2018 2019 memory_region_ref(ramblock->mr); 2020 qemu_mutex_lock(&rs->src_page_req_mutex); 2021 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2022 migration_make_urgent_request(); 2023 qemu_mutex_unlock(&rs->src_page_req_mutex); 2024 2025 return 0; 2026 } 2027 2028 static bool save_page_use_compression(RAMState *rs) 2029 { 2030 if (!migrate_compress()) { 2031 return false; 2032 } 2033 2034 /* 2035 * If xbzrle is enabled (e.g., after first round of migration), stop 2036 * using the data compression. In theory, xbzrle can do better than 2037 * compression. 2038 */ 2039 if (rs->xbzrle_started) { 2040 return false; 2041 } 2042 2043 return true; 2044 } 2045 2046 /* 2047 * Try to compress the page before posting it out; return true if the page 2048 * has been properly handled by compression, otherwise it needs other 2049 * paths to handle it. 2050 */ 2051 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2052 RAMBlock *block, ram_addr_t offset) 2053 { 2054 if (!save_page_use_compression(rs)) { 2055 return false; 2056 } 2057 2058 /* 2059 * When starting the process of a new block, the first page of 2060 * the block should be sent out before other pages in the same 2061 * block, and all the pages in the last block should have been sent 2062 * out; keeping this order is important, because the 'cont' flag 2063 * is used to avoid resending the block name. 2064 * 2065 * We post the first page as a normal page because compression takes 2066 * a lot of CPU resources.
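* (ram_flush_compressed_data() below pushes out any compressed pages still queued for the previous block before we return false and let the first page of the new block go out as a normal page, which keeps that ordering intact.)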
2067 */ 2068 if (block != pss->last_sent_block) { 2069 ram_flush_compressed_data(rs); 2070 return false; 2071 } 2072 2073 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) { 2074 return true; 2075 } 2076 2077 compression_counters.busy++; 2078 return false; 2079 } 2080 2081 /** 2082 * ram_save_target_page_legacy: save one target page 2083 * 2084 * Returns the number of pages written 2085 * 2086 * @rs: current RAM state 2087 * @pss: data about the page we want to send 2088 */ 2089 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2090 { 2091 RAMBlock *block = pss->block; 2092 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2093 int res; 2094 2095 if (control_save_page(pss, block, offset, &res)) { 2096 return res; 2097 } 2098 2099 if (save_compress_page(rs, pss, block, offset)) { 2100 return 1; 2101 } 2102 2103 res = save_zero_page(pss, pss->pss_channel, block, offset); 2104 if (res > 0) { 2105 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2106 * page would be stale 2107 */ 2108 if (rs->xbzrle_started) { 2109 XBZRLE_cache_lock(); 2110 xbzrle_cache_zero_page(rs, block->offset + offset); 2111 XBZRLE_cache_unlock(); 2112 } 2113 return res; 2114 } 2115 2116 /* 2117 * Do not use multifd in postcopy as one whole host page should be 2118 * placed. Meanwhile postcopy requires atomic update of pages, so even 2119 * if host page size == guest page size the dest guest during run may 2120 * still see partially copied pages which is data corruption. 2121 */ 2122 if (migrate_multifd() && !migration_in_postcopy()) { 2123 return ram_save_multifd_page(pss->pss_channel, block, offset); 2124 } 2125 2126 return ram_save_page(rs, pss); 2127 } 2128 2129 /* Should be called before sending a host page */ 2130 static void pss_host_page_prepare(PageSearchStatus *pss) 2131 { 2132 /* How many guest pages are there in one host page? */ 2133 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2134 2135 pss->host_page_sending = true; 2136 if (guest_pfns <= 1) { 2137 /* 2138 * This covers both when guest psize == host psize, or when guest 2139 * has larger psize than the host (guest_pfns==0). 2140 * 2141 * For the latter, we always send one whole guest page per 2142 * iteration of the host page (example: an Alpha VM on x86 host 2143 * will have guest psize 8K while host psize 4K). 2144 */ 2145 pss->host_page_start = pss->page; 2146 pss->host_page_end = pss->page + 1; 2147 } else { 2148 /* 2149 * The host page spans over multiple guest pages, we send them 2150 * within the same host page iteration. 2151 */ 2152 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2153 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2154 } 2155 } 2156 2157 /* 2158 * Whether the page pointed by PSS is within the host page being sent. 2159 * Must be called after a previous pss_host_page_prepare(). 2160 */ 2161 static bool pss_within_range(PageSearchStatus *pss) 2162 { 2163 ram_addr_t ram_addr; 2164 2165 assert(pss->host_page_sending); 2166 2167 /* Over host-page boundary? 
*/ 2168 if (pss->page >= pss->host_page_end) { 2169 return false; 2170 } 2171 2172 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2173 2174 return offset_in_ramblock(pss->block, ram_addr); 2175 } 2176 2177 static void pss_host_page_finish(PageSearchStatus *pss) 2178 { 2179 pss->host_page_sending = false; 2180 /* This is not needed, but just to reset it */ 2181 pss->host_page_start = pss->host_page_end = 0; 2182 } 2183 2184 /* 2185 * Send an urgent host page specified by `pss'. Needs to be called with 2186 * bitmap_mutex held. 2187 * 2188 * Returns 0 if saving the host page succeeded, negative otherwise. 2189 */ 2190 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2191 { 2192 bool page_dirty, sent = false; 2193 RAMState *rs = ram_state; 2194 int ret = 0; 2195 2196 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2197 pss_host_page_prepare(pss); 2198 2199 /* 2200 * If precopy is sending the same page, let it be done in precopy, otherwise 2201 * we could send the same page in two channels and neither of them would 2202 * receive the whole page. 2203 */ 2204 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2205 trace_postcopy_preempt_hit(pss->block->idstr, 2206 pss->page << TARGET_PAGE_BITS); 2207 return 0; 2208 } 2209 2210 do { 2211 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2212 2213 if (page_dirty) { 2214 /* Be strict with the return code; it must be 1 (one page sent) */ 2215 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2216 error_report_once("%s: ram_save_target_page failed", __func__); 2217 ret = -1; 2218 goto out; 2219 } 2220 sent = true; 2221 } 2222 pss_find_next_dirty(pss); 2223 } while (pss_within_range(pss)); 2224 out: 2225 pss_host_page_finish(pss); 2226 /* For urgent requests, flush immediately if sent */ 2227 if (sent) { 2228 qemu_fflush(pss->pss_channel); 2229 } 2230 return ret; 2231 } 2232 2233 /** 2234 * ram_save_host_page: save a whole host page 2235 * 2236 * Starting at *offset send pages up to the end of the current host 2237 * page. It's valid for the initial offset to point into the middle of 2238 * a host page in which case the remainder of the hostpage is sent. 2239 * Only dirty target pages are sent. Note that the host page size may 2240 * be a huge page for this block. 2241 * 2242 * The saving stops at the boundary of the used_length of the block 2243 * if the RAMBlock isn't a multiple of the host page size. 2244 * 2245 * The caller must hold ram_state.bitmap_mutex when calling this 2246 * function. Note that this function can temporarily release the lock, but 2247 * it will make sure the lock is held again when it returns.
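* (The lock is only dropped around ram_save_target_page() when postcopy preemption is active; see the preempt_active handling below.)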
2248 * 2249 * Returns the number of pages written or negative on error 2250 * 2251 * @rs: current RAM state 2252 * @pss: data about the page we want to send 2253 */ 2254 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2255 { 2256 bool page_dirty, preempt_active = postcopy_preempt_active(); 2257 int tmppages, pages = 0; 2258 size_t pagesize_bits = 2259 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2260 unsigned long start_page = pss->page; 2261 int res; 2262 2263 if (ramblock_is_ignored(pss->block)) { 2264 error_report("block %s should not be migrated !", pss->block->idstr); 2265 return 0; 2266 } 2267 2268 /* Update host page boundary information */ 2269 pss_host_page_prepare(pss); 2270 2271 do { 2272 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2273 2274 /* Check the pages is dirty and if it is send it */ 2275 if (page_dirty) { 2276 /* 2277 * Properly yield the lock only in postcopy preempt mode 2278 * because both migration thread and rp-return thread can 2279 * operate on the bitmaps. 2280 */ 2281 if (preempt_active) { 2282 qemu_mutex_unlock(&rs->bitmap_mutex); 2283 } 2284 tmppages = migration_ops->ram_save_target_page(rs, pss); 2285 if (tmppages >= 0) { 2286 pages += tmppages; 2287 /* 2288 * Allow rate limiting to happen in the middle of huge pages if 2289 * something is sent in the current iteration. 2290 */ 2291 if (pagesize_bits > 1 && tmppages > 0) { 2292 migration_rate_limit(); 2293 } 2294 } 2295 if (preempt_active) { 2296 qemu_mutex_lock(&rs->bitmap_mutex); 2297 } 2298 } else { 2299 tmppages = 0; 2300 } 2301 2302 if (tmppages < 0) { 2303 pss_host_page_finish(pss); 2304 return tmppages; 2305 } 2306 2307 pss_find_next_dirty(pss); 2308 } while (pss_within_range(pss)); 2309 2310 pss_host_page_finish(pss); 2311 2312 res = ram_save_release_protection(rs, pss, start_page); 2313 return (res < 0 ? res : pages); 2314 } 2315 2316 /** 2317 * ram_find_and_save_block: finds a dirty page and sends it to f 2318 * 2319 * Called within an RCU critical section. 2320 * 2321 * Returns the number of pages written where zero means no dirty pages, 2322 * or negative on error 2323 * 2324 * @rs: current RAM state 2325 * 2326 * On systems where host-page-size > target-page-size it will send all the 2327 * pages in a host page that are dirty. 2328 */ 2329 static int ram_find_and_save_block(RAMState *rs) 2330 { 2331 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2332 int pages = 0; 2333 2334 /* No dirty page as there is zero RAM */ 2335 if (!rs->ram_bytes_total) { 2336 return pages; 2337 } 2338 2339 /* 2340 * Always keep last_seen_block/last_page valid during this procedure, 2341 * because find_dirty_block() relies on these values (e.g., we compare 2342 * last_seen_block with pss.block to see whether we searched all the 2343 * ramblocks) to detect the completion of migration. Having NULL value 2344 * of last_seen_block can conditionally cause below loop to run forever. 
2345 */ 2346 if (!rs->last_seen_block) { 2347 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2348 rs->last_page = 0; 2349 } 2350 2351 pss_init(pss, rs->last_seen_block, rs->last_page); 2352 2353 while (true){ 2354 if (!get_queued_page(rs, pss)) { 2355 /* priority queue empty, so just search for something dirty */ 2356 int res = find_dirty_block(rs, pss); 2357 if (res != PAGE_DIRTY_FOUND) { 2358 if (res == PAGE_ALL_CLEAN) { 2359 break; 2360 } else if (res == PAGE_TRY_AGAIN) { 2361 continue; 2362 } else if (res < 0) { 2363 pages = res; 2364 break; 2365 } 2366 } 2367 } 2368 pages = ram_save_host_page(rs, pss); 2369 if (pages) { 2370 break; 2371 } 2372 } 2373 2374 rs->last_seen_block = pss->block; 2375 rs->last_page = pss->page; 2376 2377 return pages; 2378 } 2379 2380 static uint64_t ram_bytes_total_with_ignored(void) 2381 { 2382 RAMBlock *block; 2383 uint64_t total = 0; 2384 2385 RCU_READ_LOCK_GUARD(); 2386 2387 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2388 total += block->used_length; 2389 } 2390 return total; 2391 } 2392 2393 uint64_t ram_bytes_total(void) 2394 { 2395 RAMBlock *block; 2396 uint64_t total = 0; 2397 2398 RCU_READ_LOCK_GUARD(); 2399 2400 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2401 total += block->used_length; 2402 } 2403 return total; 2404 } 2405 2406 static void xbzrle_load_setup(void) 2407 { 2408 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2409 } 2410 2411 static void xbzrle_load_cleanup(void) 2412 { 2413 g_free(XBZRLE.decoded_buf); 2414 XBZRLE.decoded_buf = NULL; 2415 } 2416 2417 static void ram_state_cleanup(RAMState **rsp) 2418 { 2419 if (*rsp) { 2420 migration_page_queue_free(*rsp); 2421 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2422 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2423 g_free(*rsp); 2424 *rsp = NULL; 2425 } 2426 } 2427 2428 static void xbzrle_cleanup(void) 2429 { 2430 XBZRLE_cache_lock(); 2431 if (XBZRLE.cache) { 2432 cache_fini(XBZRLE.cache); 2433 g_free(XBZRLE.encoded_buf); 2434 g_free(XBZRLE.current_buf); 2435 g_free(XBZRLE.zero_target_page); 2436 XBZRLE.cache = NULL; 2437 XBZRLE.encoded_buf = NULL; 2438 XBZRLE.current_buf = NULL; 2439 XBZRLE.zero_target_page = NULL; 2440 } 2441 XBZRLE_cache_unlock(); 2442 } 2443 2444 static void ram_save_cleanup(void *opaque) 2445 { 2446 RAMState **rsp = opaque; 2447 RAMBlock *block; 2448 2449 /* We don't use dirty log with background snapshots */ 2450 if (!migrate_background_snapshot()) { 2451 /* caller have hold iothread lock or is in a bh, so there is 2452 * no writing race against the migration bitmap 2453 */ 2454 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2455 /* 2456 * do not stop dirty log without starting it, since 2457 * memory_global_dirty_log_stop will assert that 2458 * memory_global_dirty_log_start/stop used in pairs 2459 */ 2460 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2461 } 2462 } 2463 2464 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2465 g_free(block->clear_bmap); 2466 block->clear_bmap = NULL; 2467 g_free(block->bmap); 2468 block->bmap = NULL; 2469 } 2470 2471 xbzrle_cleanup(); 2472 compress_threads_save_cleanup(); 2473 ram_state_cleanup(rsp); 2474 g_free(migration_ops); 2475 migration_ops = NULL; 2476 } 2477 2478 static void ram_state_reset(RAMState *rs) 2479 { 2480 int i; 2481 2482 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2483 rs->pss[i].last_sent_block = NULL; 2484 } 2485 2486 rs->last_seen_block = NULL; 2487 rs->last_page = 0; 2488 rs->last_version = ram_list.version; 2489 rs->xbzrle_started = false; 2490 } 2491 2492 #define MAX_WAIT 50 /* ms, half buffered_file 
limit */ 2493 2494 /* **** functions for postcopy ***** */ 2495 2496 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2497 { 2498 struct RAMBlock *block; 2499 2500 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2501 unsigned long *bitmap = block->bmap; 2502 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2503 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2504 2505 while (run_start < range) { 2506 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2507 ram_discard_range(block->idstr, 2508 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2509 ((ram_addr_t)(run_end - run_start)) 2510 << TARGET_PAGE_BITS); 2511 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2512 } 2513 } 2514 } 2515 2516 /** 2517 * postcopy_send_discard_bm_ram: discard a RAMBlock 2518 * 2519 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2520 * 2521 * @ms: current migration state 2522 * @block: RAMBlock to discard 2523 */ 2524 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2525 { 2526 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2527 unsigned long current; 2528 unsigned long *bitmap = block->bmap; 2529 2530 for (current = 0; current < end; ) { 2531 unsigned long one = find_next_bit(bitmap, end, current); 2532 unsigned long zero, discard_length; 2533 2534 if (one >= end) { 2535 break; 2536 } 2537 2538 zero = find_next_zero_bit(bitmap, end, one + 1); 2539 2540 if (zero >= end) { 2541 discard_length = end - one; 2542 } else { 2543 discard_length = zero - one; 2544 } 2545 postcopy_discard_send_range(ms, one, discard_length); 2546 current = one + discard_length; 2547 } 2548 } 2549 2550 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2551 2552 /** 2553 * postcopy_each_ram_send_discard: discard all RAMBlocks 2554 * 2555 * Utility for the outgoing postcopy code. 2556 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2557 * passing it bitmap indexes and name. 2558 * (qemu_ram_foreach_block ends up passing unscaled lengths 2559 * which would mean postcopy code would have to deal with target page) 2560 * 2561 * @ms: current migration state 2562 */ 2563 static void postcopy_each_ram_send_discard(MigrationState *ms) 2564 { 2565 struct RAMBlock *block; 2566 2567 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2568 postcopy_discard_send_init(ms, block->idstr); 2569 2570 /* 2571 * Deal with TPS != HPS and huge pages. It discard any partially sent 2572 * host-page size chunks, mark any partially dirty host-page size 2573 * chunks as all dirty. In this case the host-page is the host-page 2574 * for the particular RAMBlock, i.e. it might be a huge page. 2575 */ 2576 postcopy_chunk_hostpages_pass(ms, block); 2577 2578 /* 2579 * Postcopy sends chunks of bitmap over the wire, but it 2580 * just needs indexes at this point, avoids it having 2581 * target page specific code. 2582 */ 2583 postcopy_send_discard_bm_ram(ms, block); 2584 postcopy_discard_send_finish(ms); 2585 } 2586 } 2587 2588 /** 2589 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2590 * 2591 * Helper for postcopy_chunk_hostpages; it's called twice to 2592 * canonicalize the two bitmaps, that are similar, but one is 2593 * inverted. 2594 * 2595 * Postcopy requires that all target pages in a hostpage are dirty or 2596 * clean, not a mix. This function canonicalizes the bitmaps. 
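* (Illustrative example: for a RAMBlock backed by 2 MiB hugepages with 4 KiB target pages, host_ratio is 512, so any run of dirty bits that starts or ends inside such a 512-page span gets the whole span marked dirty below.)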
2597 * 2598 * @ms: current migration state 2599 * @block: block that contains the page we want to canonicalize 2600 */ 2601 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2602 { 2603 RAMState *rs = ram_state; 2604 unsigned long *bitmap = block->bmap; 2605 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2606 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2607 unsigned long run_start; 2608 2609 if (block->page_size == TARGET_PAGE_SIZE) { 2610 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2611 return; 2612 } 2613 2614 /* Find a dirty page */ 2615 run_start = find_next_bit(bitmap, pages, 0); 2616 2617 while (run_start < pages) { 2618 2619 /* 2620 * If the start of this run of pages is in the middle of a host 2621 * page, then we need to fixup this host page. 2622 */ 2623 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2624 /* Find the end of this run */ 2625 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2626 /* 2627 * If the end isn't at the start of a host page, then the 2628 * run doesn't finish at the end of a host page 2629 * and we need to discard. 2630 */ 2631 } 2632 2633 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2634 unsigned long page; 2635 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2636 host_ratio); 2637 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2638 2639 /* Clean up the bitmap */ 2640 for (page = fixup_start_addr; 2641 page < fixup_start_addr + host_ratio; page++) { 2642 /* 2643 * Remark them as dirty, updating the count for any pages 2644 * that weren't previously dirty. 2645 */ 2646 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2647 } 2648 } 2649 2650 /* Find the next dirty page for the next iteration */ 2651 run_start = find_next_bit(bitmap, pages, run_start); 2652 } 2653 } 2654 2655 /** 2656 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2657 * 2658 * Transmit the set of pages to be discarded after precopy to the target 2659 * these are pages that: 2660 * a) Have been previously transmitted but are now dirty again 2661 * b) Pages that have never been transmitted, this ensures that 2662 * any pages on the destination that have been mapped by background 2663 * tasks get discarded (transparent huge pages is the specific concern) 2664 * Hopefully this is pretty sparse 2665 * 2666 * @ms: current migration state 2667 */ 2668 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2669 { 2670 RAMState *rs = ram_state; 2671 2672 RCU_READ_LOCK_GUARD(); 2673 2674 /* This should be our last sync, the src is now paused */ 2675 migration_bitmap_sync(rs, false); 2676 2677 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2678 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2679 rs->last_seen_block = NULL; 2680 rs->last_page = 0; 2681 2682 postcopy_each_ram_send_discard(ms); 2683 2684 trace_ram_postcopy_send_discard_bitmap(); 2685 } 2686 2687 /** 2688 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2689 * 2690 * Returns zero on success 2691 * 2692 * @rbname: name of the RAMBlock of the request. NULL means the 2693 * same that last one. 
2694 * @start: RAMBlock starting page 2695 * @length: RAMBlock size 2696 */ 2697 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2698 { 2699 trace_ram_discard_range(rbname, start, length); 2700 2701 RCU_READ_LOCK_GUARD(); 2702 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2703 2704 if (!rb) { 2705 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2706 return -1; 2707 } 2708 2709 /* 2710 * On source VM, we don't need to update the received bitmap since 2711 * we don't even have one. 2712 */ 2713 if (rb->receivedmap) { 2714 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2715 length >> qemu_target_page_bits()); 2716 } 2717 2718 return ram_block_discard_range(rb, start, length); 2719 } 2720 2721 /* 2722 * For every allocation, we will try not to crash the VM if the 2723 * allocation failed. 2724 */ 2725 static int xbzrle_init(void) 2726 { 2727 Error *local_err = NULL; 2728 2729 if (!migrate_xbzrle()) { 2730 return 0; 2731 } 2732 2733 XBZRLE_cache_lock(); 2734 2735 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2736 if (!XBZRLE.zero_target_page) { 2737 error_report("%s: Error allocating zero page", __func__); 2738 goto err_out; 2739 } 2740 2741 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2742 TARGET_PAGE_SIZE, &local_err); 2743 if (!XBZRLE.cache) { 2744 error_report_err(local_err); 2745 goto free_zero_page; 2746 } 2747 2748 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2749 if (!XBZRLE.encoded_buf) { 2750 error_report("%s: Error allocating encoded_buf", __func__); 2751 goto free_cache; 2752 } 2753 2754 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2755 if (!XBZRLE.current_buf) { 2756 error_report("%s: Error allocating current_buf", __func__); 2757 goto free_encoded_buf; 2758 } 2759 2760 /* We are all good */ 2761 XBZRLE_cache_unlock(); 2762 return 0; 2763 2764 free_encoded_buf: 2765 g_free(XBZRLE.encoded_buf); 2766 XBZRLE.encoded_buf = NULL; 2767 free_cache: 2768 cache_fini(XBZRLE.cache); 2769 XBZRLE.cache = NULL; 2770 free_zero_page: 2771 g_free(XBZRLE.zero_target_page); 2772 XBZRLE.zero_target_page = NULL; 2773 err_out: 2774 XBZRLE_cache_unlock(); 2775 return -ENOMEM; 2776 } 2777 2778 static int ram_state_init(RAMState **rsp) 2779 { 2780 *rsp = g_try_new0(RAMState, 1); 2781 2782 if (!*rsp) { 2783 error_report("%s: Init ramstate fail", __func__); 2784 return -1; 2785 } 2786 2787 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2788 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2789 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2790 (*rsp)->ram_bytes_total = ram_bytes_total(); 2791 2792 /* 2793 * Count the total number of pages used by ram blocks not including any 2794 * gaps due to alignment or unplugs. 2795 * This must match with the initial values of dirty bitmap. 
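* (ram_list_init_bitmaps() below sets every bit in each RAMBlock's bmap, so initializing migration_dirty_pages to ram_bytes_total >> TARGET_PAGE_BITS keeps the counter in line with those all-ones bitmaps.)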
2796 */ 2797 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2798 ram_state_reset(*rsp); 2799 2800 return 0; 2801 } 2802 2803 static void ram_list_init_bitmaps(void) 2804 { 2805 MigrationState *ms = migrate_get_current(); 2806 RAMBlock *block; 2807 unsigned long pages; 2808 uint8_t shift; 2809 2810 /* Skip setting bitmap if there is no RAM */ 2811 if (ram_bytes_total()) { 2812 shift = ms->clear_bitmap_shift; 2813 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2814 error_report("clear_bitmap_shift (%u) too big, using " 2815 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2816 shift = CLEAR_BITMAP_SHIFT_MAX; 2817 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2818 error_report("clear_bitmap_shift (%u) too small, using " 2819 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2820 shift = CLEAR_BITMAP_SHIFT_MIN; 2821 } 2822 2823 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2824 pages = block->max_length >> TARGET_PAGE_BITS; 2825 /* 2826 * The initial dirty bitmap for migration must be set with all 2827 * ones to make sure we'll migrate every guest RAM page to the 2828 * destination. 2829 * Here we set RAMBlock.bmap all to 1 because when restarting 2830 * migration after a failed one, ram_list. 2831 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole 2832 * guest memory. 2833 */ 2834 block->bmap = bitmap_new(pages); 2835 bitmap_set(block->bmap, 0, pages); 2836 block->clear_bmap_shift = shift; 2837 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2838 } 2839 } 2840 } 2841 2842 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2843 { 2844 unsigned long pages; 2845 RAMBlock *rb; 2846 2847 RCU_READ_LOCK_GUARD(); 2848 2849 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2850 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2851 rs->migration_dirty_pages -= pages; 2852 } 2853 } 2854 2855 static void ram_init_bitmaps(RAMState *rs) 2856 { 2857 /* For memory_global_dirty_log_start below. */ 2858 qemu_mutex_lock_iothread(); 2859 qemu_mutex_lock_ramlist(); 2860 2861 WITH_RCU_READ_LOCK_GUARD() { 2862 ram_list_init_bitmaps(); 2863 /* We don't use dirty log with background snapshots */ 2864 if (!migrate_background_snapshot()) { 2865 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2866 migration_bitmap_sync_precopy(rs, false); 2867 } 2868 } 2869 qemu_mutex_unlock_ramlist(); 2870 qemu_mutex_unlock_iothread(); 2871 2872 /* 2873 * After an eventual first bitmap sync, fixup the initial bitmap 2874 * containing all 1s to exclude any discarded pages from migration. 2875 */ 2876 migration_bitmap_clear_discarded_pages(rs); 2877 } 2878 2879 static int ram_init_all(RAMState **rsp) 2880 { 2881 if (ram_state_init(rsp)) { 2882 return -1; 2883 } 2884 2885 if (xbzrle_init()) { 2886 ram_state_cleanup(rsp); 2887 return -1; 2888 } 2889 2890 ram_init_bitmaps(*rsp); 2891 2892 return 0; 2893 } 2894 2895 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2896 { 2897 RAMBlock *block; 2898 uint64_t pages = 0; 2899 2900 /* 2901 * Postcopy is not using xbzrle/compression, so no need for that. 2902 * Also, since the source is already halted, we don't need to care 2903 * about dirty page logging either. 2904 */ 2905 2906 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2907 pages += bitmap_count_one(block->bmap, 2908 block->used_length >> TARGET_PAGE_BITS); 2909 } 2910 2911 /* This may not be aligned with current bitmaps. Recalculate.
*/ 2912 rs->migration_dirty_pages = pages; 2913 2914 ram_state_reset(rs); 2915 2916 /* Update RAMState cache of output QEMUFile */ 2917 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2918 2919 trace_ram_state_resume_prepare(pages); 2920 } 2921 2922 /* 2923 * This function clears bits of the free pages reported by the caller from the 2924 * migration dirty bitmap. @addr is the host address corresponding to the 2925 * start of the continuous guest free pages, and @len is the total bytes of 2926 * those pages. 2927 */ 2928 void qemu_guest_free_page_hint(void *addr, size_t len) 2929 { 2930 RAMBlock *block; 2931 ram_addr_t offset; 2932 size_t used_len, start, npages; 2933 MigrationState *s = migrate_get_current(); 2934 2935 /* This function is currently expected to be used during live migration */ 2936 if (!migration_is_setup_or_active(s->state)) { 2937 return; 2938 } 2939 2940 for (; len > 0; len -= used_len, addr += used_len) { 2941 block = qemu_ram_block_from_host(addr, false, &offset); 2942 if (unlikely(!block || offset >= block->used_length)) { 2943 /* 2944 * The implementation might not support RAMBlock resize during 2945 * live migration, but it could happen in theory with future 2946 * updates. So we add a check here to capture that case. 2947 */ 2948 error_report_once("%s unexpected error", __func__); 2949 return; 2950 } 2951 2952 if (len <= block->used_length - offset) { 2953 used_len = len; 2954 } else { 2955 used_len = block->used_length - offset; 2956 } 2957 2958 start = offset >> TARGET_PAGE_BITS; 2959 npages = used_len >> TARGET_PAGE_BITS; 2960 2961 qemu_mutex_lock(&ram_state->bitmap_mutex); 2962 /* 2963 * The skipped free pages are equivalent to having been sent from clear_bmap's 2964 * perspective, so clear the bits from the memory region bitmap which 2965 * are initially set. Otherwise those skipped pages will be sent in 2966 * the next round after syncing from the memory region bitmap. 2967 */ 2968 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2969 ram_state->migration_dirty_pages -= 2970 bitmap_count_one_with_offset(block->bmap, start, npages); 2971 bitmap_clear(block->bmap, start, npages); 2972 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2973 } 2974 } 2975 2976 /* 2977 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a 2978 * long-running RCU critical section. When rcu-reclaims in the code 2979 * start to become numerous it will be necessary to reduce the 2980 * granularity of these critical sections. 2981 */ 2982 2983 /** 2984 * ram_save_setup: Setup RAM for migration 2985 * 2986 * Returns zero to indicate success and negative for error 2987 * 2988 * @f: QEMUFile where to send the data 2989 * @opaque: RAMState pointer 2990 */ 2991 static int ram_save_setup(QEMUFile *f, void *opaque) 2992 { 2993 RAMState **rsp = opaque; 2994 RAMBlock *block; 2995 int ret; 2996 2997 if (compress_threads_save_setup()) { 2998 return -1; 2999 } 3000 3001 /* migration has already set up the bitmap, reuse it.
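* (That is the COLO case: when we are already in COLO state the existing bitmaps are kept; otherwise ram_init_all() below allocates and initializes fresh state.)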
*/ 3002 if (!migration_in_colo_state()) { 3003 if (ram_init_all(rsp) != 0) { 3004 compress_threads_save_cleanup(); 3005 return -1; 3006 } 3007 } 3008 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3009 3010 WITH_RCU_READ_LOCK_GUARD() { 3011 qemu_put_be64(f, ram_bytes_total_with_ignored() 3012 | RAM_SAVE_FLAG_MEM_SIZE); 3013 3014 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3015 qemu_put_byte(f, strlen(block->idstr)); 3016 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3017 qemu_put_be64(f, block->used_length); 3018 if (migrate_postcopy_ram() && block->page_size != 3019 qemu_host_page_size) { 3020 qemu_put_be64(f, block->page_size); 3021 } 3022 if (migrate_ignore_shared()) { 3023 qemu_put_be64(f, block->mr->addr); 3024 } 3025 } 3026 } 3027 3028 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3029 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3030 3031 migration_ops = g_malloc0(sizeof(MigrationOps)); 3032 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3033 ret = multifd_send_sync_main(f); 3034 if (ret < 0) { 3035 return ret; 3036 } 3037 3038 if (!migrate_multifd_flush_after_each_section()) { 3039 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3040 } 3041 3042 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3043 qemu_fflush(f); 3044 3045 return 0; 3046 } 3047 3048 /** 3049 * ram_save_iterate: iterative stage for migration 3050 * 3051 * Returns zero to indicate success and negative for error 3052 * 3053 * @f: QEMUFile where to send the data 3054 * @opaque: RAMState pointer 3055 */ 3056 static int ram_save_iterate(QEMUFile *f, void *opaque) 3057 { 3058 RAMState **temp = opaque; 3059 RAMState *rs = *temp; 3060 int ret = 0; 3061 int i; 3062 int64_t t0; 3063 int done = 0; 3064 3065 if (blk_mig_bulk_active()) { 3066 /* Avoid transferring ram during bulk phase of block migration as 3067 * the bulk phase will usually take a long time and transferring 3068 * ram updates during that time is pointless. */ 3069 goto out; 3070 } 3071 3072 /* 3073 * We'll take this lock a little bit long, but it's okay for two reasons. 3074 * Firstly, the only possible other thread to take it is who calls 3075 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3076 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3077 * guarantees that we'll at least released it in a regular basis. 3078 */ 3079 qemu_mutex_lock(&rs->bitmap_mutex); 3080 WITH_RCU_READ_LOCK_GUARD() { 3081 if (ram_list.version != rs->last_version) { 3082 ram_state_reset(rs); 3083 } 3084 3085 /* Read version before ram_list.blocks */ 3086 smp_rmb(); 3087 3088 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3089 3090 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3091 i = 0; 3092 while ((ret = migration_rate_exceeded(f)) == 0 || 3093 postcopy_has_request(rs)) { 3094 int pages; 3095 3096 if (qemu_file_get_error(f)) { 3097 break; 3098 } 3099 3100 pages = ram_find_and_save_block(rs); 3101 /* no more pages to sent */ 3102 if (pages == 0) { 3103 done = 1; 3104 break; 3105 } 3106 3107 if (pages < 0) { 3108 qemu_file_set_error(f, pages); 3109 break; 3110 } 3111 3112 rs->target_page_count += pages; 3113 3114 /* 3115 * During postcopy, it is necessary to make sure one whole host 3116 * page is sent in one chunk. 3117 */ 3118 if (migrate_postcopy_ram()) { 3119 ram_flush_compressed_data(rs); 3120 } 3121 3122 /* 3123 * we want to check in the 1st loop, just in case it was the 1st 3124 * time and we had to sync the dirty bitmap. 
3125 * qemu_clock_get_ns() is a bit expensive, so we only check each 3126 * some iterations 3127 */ 3128 if ((i & 63) == 0) { 3129 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3130 1000000; 3131 if (t1 > MAX_WAIT) { 3132 trace_ram_save_iterate_big_wait(t1, i); 3133 break; 3134 } 3135 } 3136 i++; 3137 } 3138 } 3139 qemu_mutex_unlock(&rs->bitmap_mutex); 3140 3141 /* 3142 * Must occur before EOS (or any QEMUFile operation) 3143 * because of RDMA protocol. 3144 */ 3145 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3146 3147 out: 3148 if (ret >= 0 3149 && migration_is_setup_or_active(migrate_get_current()->state)) { 3150 if (migrate_multifd_flush_after_each_section()) { 3151 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3152 if (ret < 0) { 3153 return ret; 3154 } 3155 } 3156 3157 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3158 qemu_fflush(f); 3159 ram_transferred_add(8); 3160 3161 ret = qemu_file_get_error(f); 3162 } 3163 if (ret < 0) { 3164 return ret; 3165 } 3166 3167 return done; 3168 } 3169 3170 /** 3171 * ram_save_complete: function called to send the remaining amount of ram 3172 * 3173 * Returns zero to indicate success or negative on error 3174 * 3175 * Called with iothread lock 3176 * 3177 * @f: QEMUFile where to send the data 3178 * @opaque: RAMState pointer 3179 */ 3180 static int ram_save_complete(QEMUFile *f, void *opaque) 3181 { 3182 RAMState **temp = opaque; 3183 RAMState *rs = *temp; 3184 int ret = 0; 3185 3186 rs->last_stage = !migration_in_colo_state(); 3187 3188 WITH_RCU_READ_LOCK_GUARD() { 3189 if (!migration_in_postcopy()) { 3190 migration_bitmap_sync_precopy(rs, true); 3191 } 3192 3193 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3194 3195 /* try transferring iterative blocks of memory */ 3196 3197 /* flush all remaining blocks regardless of rate limiting */ 3198 qemu_mutex_lock(&rs->bitmap_mutex); 3199 while (true) { 3200 int pages; 3201 3202 pages = ram_find_and_save_block(rs); 3203 /* no more blocks to sent */ 3204 if (pages == 0) { 3205 break; 3206 } 3207 if (pages < 0) { 3208 ret = pages; 3209 break; 3210 } 3211 } 3212 qemu_mutex_unlock(&rs->bitmap_mutex); 3213 3214 ram_flush_compressed_data(rs); 3215 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3216 } 3217 3218 if (ret < 0) { 3219 return ret; 3220 } 3221 3222 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3223 if (ret < 0) { 3224 return ret; 3225 } 3226 3227 if (!migrate_multifd_flush_after_each_section()) { 3228 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3229 } 3230 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3231 qemu_fflush(f); 3232 3233 return 0; 3234 } 3235 3236 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3237 uint64_t *can_postcopy) 3238 { 3239 RAMState **temp = opaque; 3240 RAMState *rs = *temp; 3241 3242 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3243 3244 if (migrate_postcopy_ram()) { 3245 /* We can do postcopy, and all the data is postcopiable */ 3246 *can_postcopy += remaining_size; 3247 } else { 3248 *must_precopy += remaining_size; 3249 } 3250 } 3251 3252 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3253 uint64_t *can_postcopy) 3254 { 3255 MigrationState *s = migrate_get_current(); 3256 RAMState **temp = opaque; 3257 RAMState *rs = *temp; 3258 3259 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3260 3261 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3262 qemu_mutex_lock_iothread(); 3263 
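/* Remaining RAM looks small enough to converge; sync the dirty bitmap once more under the BQL so the exact pending size reported below is based on fresh dirty information. */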
WITH_RCU_READ_LOCK_GUARD() { 3264 migration_bitmap_sync_precopy(rs, false); 3265 } 3266 qemu_mutex_unlock_iothread(); 3267 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3268 } 3269 3270 if (migrate_postcopy_ram()) { 3271 /* We can do postcopy, and all the data is postcopiable */ 3272 *can_postcopy += remaining_size; 3273 } else { 3274 *must_precopy += remaining_size; 3275 } 3276 } 3277 3278 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3279 { 3280 unsigned int xh_len; 3281 int xh_flags; 3282 uint8_t *loaded_data; 3283 3284 /* extract RLE header */ 3285 xh_flags = qemu_get_byte(f); 3286 xh_len = qemu_get_be16(f); 3287 3288 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3289 error_report("Failed to load XBZRLE page - wrong compression!"); 3290 return -1; 3291 } 3292 3293 if (xh_len > TARGET_PAGE_SIZE) { 3294 error_report("Failed to load XBZRLE page - len overflow!"); 3295 return -1; 3296 } 3297 loaded_data = XBZRLE.decoded_buf; 3298 /* load data and decode */ 3299 /* it can change loaded_data to point to an internal buffer */ 3300 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3301 3302 /* decode RLE */ 3303 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3304 TARGET_PAGE_SIZE) == -1) { 3305 error_report("Failed to load XBZRLE page - decode error!"); 3306 return -1; 3307 } 3308 3309 return 0; 3310 } 3311 3312 /** 3313 * ram_block_from_stream: read a RAMBlock id from the migration stream 3314 * 3315 * Must be called from within a rcu critical section. 3316 * 3317 * Returns a pointer from within the RCU-protected ram_list. 3318 * 3319 * @mis: the migration incoming state pointer 3320 * @f: QEMUFile where to read the data from 3321 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3322 * @channel: the channel we're using 3323 */ 3324 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3325 QEMUFile *f, int flags, 3326 int channel) 3327 { 3328 RAMBlock *block = mis->last_recv_block[channel]; 3329 char id[256]; 3330 uint8_t len; 3331 3332 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3333 if (!block) { 3334 error_report("Ack, bad migration stream!"); 3335 return NULL; 3336 } 3337 return block; 3338 } 3339 3340 len = qemu_get_byte(f); 3341 qemu_get_buffer(f, (uint8_t *)id, len); 3342 id[len] = 0; 3343 3344 block = qemu_ram_block_by_name(id); 3345 if (!block) { 3346 error_report("Can't find block %s", id); 3347 return NULL; 3348 } 3349 3350 if (ramblock_is_ignored(block)) { 3351 error_report("block %s should not be migrated !", id); 3352 return NULL; 3353 } 3354 3355 mis->last_recv_block[channel] = block; 3356 3357 return block; 3358 } 3359 3360 static inline void *host_from_ram_block_offset(RAMBlock *block, 3361 ram_addr_t offset) 3362 { 3363 if (!offset_in_ramblock(block, offset)) { 3364 return NULL; 3365 } 3366 3367 return block->host + offset; 3368 } 3369 3370 static void *host_page_from_ram_block_offset(RAMBlock *block, 3371 ram_addr_t offset) 3372 { 3373 /* Note: Explicitly no check against offset_in_ramblock(). 
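* (The postcopy caller may compute addresses past used_length if the RAMBlock shrank during migration; see the related comment in ram_load_postcopy().)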
*/ 3374 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3375 block->page_size); 3376 } 3377 3378 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3379 ram_addr_t offset) 3380 { 3381 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3382 } 3383 3384 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3385 { 3386 qemu_mutex_lock(&ram_state->bitmap_mutex); 3387 for (int i = 0; i < pages; i++) { 3388 ram_addr_t offset = normal[i]; 3389 ram_state->migration_dirty_pages += !test_and_set_bit( 3390 offset >> TARGET_PAGE_BITS, 3391 block->bmap); 3392 } 3393 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3394 } 3395 3396 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3397 ram_addr_t offset, bool record_bitmap) 3398 { 3399 if (!offset_in_ramblock(block, offset)) { 3400 return NULL; 3401 } 3402 if (!block->colo_cache) { 3403 error_report("%s: colo_cache is NULL in block :%s", 3404 __func__, block->idstr); 3405 return NULL; 3406 } 3407 3408 /* 3409 * During a COLO checkpoint, we need a bitmap of these migrated pages. 3410 * It helps us decide which pages in the ram cache should be flushed 3411 * into the VM's RAM later. 3412 */ 3413 if (record_bitmap) { 3414 colo_record_bitmap(block, &offset, 1); 3415 } 3416 return block->colo_cache + offset; 3417 } 3418 3419 /** 3420 * ram_handle_compressed: handle the zero page case 3421 * 3422 * If a page (or a whole RDMA chunk) has been 3423 * determined to be zero, then zap it. 3424 * 3425 * @host: host address for the zero page 3426 * @ch: what the page is filled from. We only support zero 3427 * @size: size of the zero page 3428 */ 3429 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3430 { 3431 if (ch != 0 || !buffer_is_zero(host, size)) { 3432 memset(host, ch, size); 3433 } 3434 } 3435 3436 static void colo_init_ram_state(void) 3437 { 3438 ram_state_init(&ram_state); 3439 } 3440 3441 /* 3442 * COLO cache: this is for the secondary VM. We cache the whole 3443 * memory of the secondary VM; the global lock needs to be held 3444 * to call this helper. 3445 */ 3446 int colo_init_ram_cache(void) 3447 { 3448 RAMBlock *block; 3449 3450 WITH_RCU_READ_LOCK_GUARD() { 3451 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3452 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3453 NULL, false, false); 3454 if (!block->colo_cache) { 3455 error_report("%s: Can't alloc memory for COLO cache of block %s," 3456 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3457 block->used_length); 3458 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3459 if (block->colo_cache) { 3460 qemu_anon_ram_free(block->colo_cache, block->used_length); 3461 block->colo_cache = NULL; 3462 } 3463 } 3464 return -errno; 3465 } 3466 if (!machine_dump_guest_core(current_machine)) { 3467 qemu_madvise(block->colo_cache, block->used_length, 3468 QEMU_MADV_DONTDUMP); 3469 } 3470 } 3471 } 3472 3473 /* 3474 * Record the dirty pages sent by the PVM; we use this dirty bitmap 3475 * to decide which pages in the cache should be flushed into the SVM's RAM. Here 3476 * we use the same name 'ram_bitmap' as for migration.
3477 */ 3478 if (ram_bytes_total()) { 3479 RAMBlock *block; 3480 3481 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3482 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3483 block->bmap = bitmap_new(pages); 3484 } 3485 } 3486 3487 colo_init_ram_state(); 3488 return 0; 3489 } 3490 3491 /* TODO: duplicated with ram_init_bitmaps */ 3492 void colo_incoming_start_dirty_log(void) 3493 { 3494 RAMBlock *block = NULL; 3495 /* For memory_global_dirty_log_start below. */ 3496 qemu_mutex_lock_iothread(); 3497 qemu_mutex_lock_ramlist(); 3498 3499 memory_global_dirty_log_sync(false); 3500 WITH_RCU_READ_LOCK_GUARD() { 3501 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3502 ramblock_sync_dirty_bitmap(ram_state, block); 3503 /* Discard this dirty bitmap record */ 3504 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3505 } 3506 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3507 } 3508 ram_state->migration_dirty_pages = 0; 3509 qemu_mutex_unlock_ramlist(); 3510 qemu_mutex_unlock_iothread(); 3511 } 3512 3513 /* It is need to hold the global lock to call this helper */ 3514 void colo_release_ram_cache(void) 3515 { 3516 RAMBlock *block; 3517 3518 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3519 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3520 g_free(block->bmap); 3521 block->bmap = NULL; 3522 } 3523 3524 WITH_RCU_READ_LOCK_GUARD() { 3525 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3526 if (block->colo_cache) { 3527 qemu_anon_ram_free(block->colo_cache, block->used_length); 3528 block->colo_cache = NULL; 3529 } 3530 } 3531 } 3532 ram_state_cleanup(&ram_state); 3533 } 3534 3535 /** 3536 * ram_load_setup: Setup RAM for migration incoming side 3537 * 3538 * Returns zero to indicate success and negative for error 3539 * 3540 * @f: QEMUFile where to receive the data 3541 * @opaque: RAMState pointer 3542 */ 3543 static int ram_load_setup(QEMUFile *f, void *opaque) 3544 { 3545 xbzrle_load_setup(); 3546 ramblock_recv_map_init(); 3547 3548 return 0; 3549 } 3550 3551 static int ram_load_cleanup(void *opaque) 3552 { 3553 RAMBlock *rb; 3554 3555 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3556 qemu_ram_block_writeback(rb); 3557 } 3558 3559 xbzrle_load_cleanup(); 3560 3561 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3562 g_free(rb->receivedmap); 3563 rb->receivedmap = NULL; 3564 } 3565 3566 return 0; 3567 } 3568 3569 /** 3570 * ram_postcopy_incoming_init: allocate postcopy data structures 3571 * 3572 * Returns 0 for success and negative if there was one error 3573 * 3574 * @mis: current migration incoming state 3575 * 3576 * Allocate data structures etc needed by incoming migration with 3577 * postcopy-ram. postcopy-ram's similarly names 3578 * postcopy_ram_incoming_init does the work. 3579 */ 3580 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3581 { 3582 return postcopy_ram_incoming_init(mis); 3583 } 3584 3585 /** 3586 * ram_load_postcopy: load a page in postcopy case 3587 * 3588 * Returns 0 for success or -errno in case of error 3589 * 3590 * Called in postcopy mode by ram_load(). 3591 * rcu_read_lock is taken prior to this being called. 
3592 * 3593 * @f: QEMUFile where to send the data 3594 * @channel: the channel to use for loading 3595 */ 3596 int ram_load_postcopy(QEMUFile *f, int channel) 3597 { 3598 int flags = 0, ret = 0; 3599 bool place_needed = false; 3600 bool matches_target_page_size = false; 3601 MigrationIncomingState *mis = migration_incoming_get_current(); 3602 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3603 3604 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3605 ram_addr_t addr; 3606 void *page_buffer = NULL; 3607 void *place_source = NULL; 3608 RAMBlock *block = NULL; 3609 uint8_t ch; 3610 int len; 3611 3612 addr = qemu_get_be64(f); 3613 3614 /* 3615 * If qemu file error, we should stop here, and then "addr" 3616 * may be invalid 3617 */ 3618 ret = qemu_file_get_error(f); 3619 if (ret) { 3620 break; 3621 } 3622 3623 flags = addr & ~TARGET_PAGE_MASK; 3624 addr &= TARGET_PAGE_MASK; 3625 3626 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3627 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3628 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3629 block = ram_block_from_stream(mis, f, flags, channel); 3630 if (!block) { 3631 ret = -EINVAL; 3632 break; 3633 } 3634 3635 /* 3636 * Relying on used_length is racy and can result in false positives. 3637 * We might place pages beyond used_length in case RAM was shrunk 3638 * while in postcopy, which is fine - trying to place via 3639 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3640 */ 3641 if (!block->host || addr >= block->postcopy_length) { 3642 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3643 ret = -EINVAL; 3644 break; 3645 } 3646 tmp_page->target_pages++; 3647 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3648 /* 3649 * Postcopy requires that we place whole host pages atomically; 3650 * these may be huge pages for RAMBlocks that are backed by 3651 * hugetlbfs. 3652 * To make it atomic, the data is read into a temporary page 3653 * that's moved into place later. 3654 * The migration protocol uses, possibly smaller, target-pages 3655 * however the source ensures it always sends all the components 3656 * of a host page in one chunk. 3657 */ 3658 page_buffer = tmp_page->tmp_huge_page + 3659 host_page_offset_from_ram_block_offset(block, addr); 3660 /* If all TP are zero then we can optimise the place */ 3661 if (tmp_page->target_pages == 1) { 3662 tmp_page->host_addr = 3663 host_page_from_ram_block_offset(block, addr); 3664 } else if (tmp_page->host_addr != 3665 host_page_from_ram_block_offset(block, addr)) { 3666 /* not the 1st TP within the HP */ 3667 error_report("Non-same host page detected on channel %d: " 3668 "Target host page %p, received host page %p " 3669 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3670 channel, tmp_page->host_addr, 3671 host_page_from_ram_block_offset(block, addr), 3672 block->idstr, addr, tmp_page->target_pages); 3673 ret = -EINVAL; 3674 break; 3675 } 3676 3677 /* 3678 * If it's the last part of a host page then we place the host 3679 * page 3680 */ 3681 if (tmp_page->target_pages == 3682 (block->page_size / TARGET_PAGE_SIZE)) { 3683 place_needed = true; 3684 } 3685 place_source = tmp_page->tmp_huge_page; 3686 } 3687 3688 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3689 case RAM_SAVE_FLAG_ZERO: 3690 ch = qemu_get_byte(f); 3691 /* 3692 * Can skip to set page_buffer when 3693 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3694 */ 3695 if (ch || !matches_target_page_size) { 3696 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3697 } 3698 if (ch) { 3699 tmp_page->all_zero = false; 3700 } 3701 break; 3702 3703 case RAM_SAVE_FLAG_PAGE: 3704 tmp_page->all_zero = false; 3705 if (!matches_target_page_size) { 3706 /* For huge pages, we always use temporary buffer */ 3707 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3708 } else { 3709 /* 3710 * For small pages that matches target page size, we 3711 * avoid the qemu_file copy. Instead we directly use 3712 * the buffer of QEMUFile to place the page. Note: we 3713 * cannot do any QEMUFile operation before using that 3714 * buffer to make sure the buffer is valid when 3715 * placing the page. 3716 */ 3717 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3718 TARGET_PAGE_SIZE); 3719 } 3720 break; 3721 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3722 tmp_page->all_zero = false; 3723 len = qemu_get_be32(f); 3724 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3725 error_report("Invalid compressed data length: %d", len); 3726 ret = -EINVAL; 3727 break; 3728 } 3729 decompress_data_with_multi_threads(f, page_buffer, len); 3730 break; 3731 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 3732 multifd_recv_sync_main(); 3733 break; 3734 case RAM_SAVE_FLAG_EOS: 3735 /* normal exit */ 3736 if (migrate_multifd_flush_after_each_section()) { 3737 multifd_recv_sync_main(); 3738 } 3739 break; 3740 default: 3741 error_report("Unknown combination of migration flags: 0x%x" 3742 " (postcopy mode)", flags); 3743 ret = -EINVAL; 3744 break; 3745 } 3746 3747 /* Got the whole host page, wait for decompress before placing. */ 3748 if (place_needed) { 3749 ret |= wait_for_decompress_done(); 3750 } 3751 3752 /* Detect for any possible file errors */ 3753 if (!ret && qemu_file_get_error(f)) { 3754 ret = qemu_file_get_error(f); 3755 } 3756 3757 if (!ret && place_needed) { 3758 if (tmp_page->all_zero) { 3759 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3760 } else { 3761 ret = postcopy_place_page(mis, tmp_page->host_addr, 3762 place_source, block); 3763 } 3764 place_needed = false; 3765 postcopy_temp_page_reset(tmp_page); 3766 } 3767 } 3768 3769 return ret; 3770 } 3771 3772 static bool postcopy_is_running(void) 3773 { 3774 PostcopyState ps = postcopy_state_get(); 3775 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3776 } 3777 3778 /* 3779 * Flush content of RAM cache into SVM's memory. 3780 * Only flush the pages that be dirtied by PVM or SVM or both. 
static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync(false);
    qemu_mutex_lock(&ram_state->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    qemu_mutex_unlock(&ram_state->bitmap_mutex);
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to load the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE comes earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_compress()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into the SVM's memory directly; we put it into colo_cache first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage, during which the VM has to be stopped, which is
             * time-consuming.  Here we optimize it by backing up every page
             * during the migration process while COLO is enabled.  Although
             * this slows the migration down, it clearly reduces the downtime
             * that backing up all of the SVM's memory in the COLO preparation
             * stage would cause.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both the cache and the SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }
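
        /*
         * Record layouts handled below (multi-byte fields are big-endian):
         *   MEM_SIZE:       a list of { idstr length, idstr, used_length }
         *                   per RAM block, optionally followed by the page
         *                   size and the GPA when postcopy / ignore-shared
         *                   is in use
         *   ZERO:           a single fill byte for the whole target page
         *   PAGE:           TARGET_PAGE_SIZE bytes of raw page data
         *   COMPRESS_PAGE:  a be32 length followed by compressed page data
         *   XBZRLE:         an XBZRLE-encoded delta against the page cache
         */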
        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd_flush_after_each_section()) {
                multifd_recv_sync_main();
            }
            break;
        case RAM_SAVE_FLAG_HOOK:
            ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x", flags);
            ret = -EINVAL;
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}
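
/*
 * ram_load: the load_state entry point registered in savevm_ram_handlers.
 *
 * Dispatches to ram_load_postcopy() when the incoming side is already in a
 * postcopy phase (page placement must then be atomic) and to
 * ram_load_precopy() otherwise.  Only stream version 4 is accepted.
 */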
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclamations in the code become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note: RAM_CHANNEL_PRECOPY here is the precopy channel of a
             * postcopy migration; a separate RAM_CHANNEL_POSTCOPY channel
             * services fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}
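
/*
 * Notify the waiter in ram_dirty_bitmap_sync_all() that one more ramblock's
 * received bitmap has been reloaded.
 */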
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it into the initial dirty bitmap.
 * This is only used when a postcopy migration has been paused and wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused), so the
     * dirty bitmap won't change and we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to get the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
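
/*
 * Resume-prepare hook for RAM: re-sync the dirty bitmaps with the
 * destination, then let ram_state_resume_prepare() recalculate
 * migration_dirty_pages before the migration is resumed.
 */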
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}