1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram-compress.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration-stats.h" 40 #include "migration/register.h" 41 #include "migration/misc.h" 42 #include "qemu-file.h" 43 #include "postcopy-ram.h" 44 #include "page_cache.h" 45 #include "qemu/error-report.h" 46 #include "qapi/error.h" 47 #include "qapi/qapi-types-migration.h" 48 #include "qapi/qapi-events-migration.h" 49 #include "qapi/qmp/qerror.h" 50 #include "trace.h" 51 #include "exec/ram_addr.h" 52 #include "exec/target_page.h" 53 #include "qemu/rcu_queue.h" 54 #include "migration/colo.h" 55 #include "block.h" 56 #include "sysemu/cpu-throttle.h" 57 #include "savevm.h" 58 #include "qemu/iov.h" 59 #include "multifd.h" 60 #include "sysemu/runstate.h" 61 #include "options.h" 62 63 #include "hw/boards.h" /* for machine_dump_guest_core() */ 64 65 #if defined(__linux__) 66 #include "qemu/userfaultfd.h" 67 #endif /* defined(__linux__) */ 68 69 /***********************************************************/ 70 /* ram save/restore */ 71 72 /* 73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 74 * worked for pages that were filled with the same char. We switched 75 * it to only search for the zero value. And to avoid confusion with 76 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
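 *
 * For illustration: the flags below are OR'ed into the low bits of the
 * page-aligned offset written by save_page_header(), so e.g. (assuming
 * 4 KiB target pages) a zero page at offset 0x3000 of an
 * already-announced block goes on the wire as the single be64 value
 *   0x3000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE == 0x3022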
77 */ 78 /* 79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 80 */ 81 #define RAM_SAVE_FLAG_FULL 0x01 82 #define RAM_SAVE_FLAG_ZERO 0x02 83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 84 #define RAM_SAVE_FLAG_PAGE 0x08 85 #define RAM_SAVE_FLAG_EOS 0x10 86 #define RAM_SAVE_FLAG_CONTINUE 0x20 87 #define RAM_SAVE_FLAG_XBZRLE 0x40 88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 89 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 91 /* We can't use any flag that is bigger than 0x200 */ 92 93 XBZRLECacheStats xbzrle_counters; 94 95 /* used by the search for pages to send */ 96 struct PageSearchStatus { 97 /* The migration channel used for a specific host page */ 98 QEMUFile *pss_channel; 99 /* Last block from where we have sent data */ 100 RAMBlock *last_sent_block; 101 /* Current block being searched */ 102 RAMBlock *block; 103 /* Current page to search from */ 104 unsigned long page; 105 /* Set once we wrap around */ 106 bool complete_round; 107 /* Whether we're sending a host page */ 108 bool host_page_sending; 109 /* The start/end of current host page. Invalid if host_page_sending==false */ 110 unsigned long host_page_start; 111 unsigned long host_page_end; 112 }; 113 typedef struct PageSearchStatus PageSearchStatus; 114 115 /* struct contains XBZRLE cache and a static page 116 used by the compression */ 117 static struct { 118 /* buffer used for XBZRLE encoding */ 119 uint8_t *encoded_buf; 120 /* buffer for storing page content */ 121 uint8_t *current_buf; 122 /* Cache for XBZRLE, Protected by lock. */ 123 PageCache *cache; 124 QemuMutex lock; 125 /* it will store a page full of zeros */ 126 uint8_t *zero_target_page; 127 /* buffer used for XBZRLE decoding */ 128 uint8_t *decoded_buf; 129 } XBZRLE; 130 131 static void XBZRLE_cache_lock(void) 132 { 133 if (migrate_xbzrle()) { 134 qemu_mutex_lock(&XBZRLE.lock); 135 } 136 } 137 138 static void XBZRLE_cache_unlock(void) 139 { 140 if (migrate_xbzrle()) { 141 qemu_mutex_unlock(&XBZRLE.lock); 142 } 143 } 144 145 /** 146 * xbzrle_cache_resize: resize the xbzrle cache 147 * 148 * This function is called from migrate_params_apply in main 149 * thread, possibly while a migration is in progress. A running 150 * migration may be using the cache and might finish during this call, 151 * hence changes to the cache are protected by XBZRLE.lock(). 
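 *
 * For illustration, a typical path into this function (assuming the
 * usual QMP parameter name) is the user resizing the cache at runtime:
 *
 *   { "execute": "migrate-set-parameters",
 *     "arguments": { "xbzrle-cache-size": 536870912 } }
 *
 * which reaches this function with new_size == 512 MiB.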
152 * 153 * Returns 0 for success or -1 for error 154 * 155 * @new_size: new cache size 156 * @errp: set *errp if the check failed, with reason 157 */ 158 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 159 { 160 PageCache *new_cache; 161 int64_t ret = 0; 162 163 /* Check for truncation */ 164 if (new_size != (size_t)new_size) { 165 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 166 "exceeding address space"); 167 return -1; 168 } 169 170 if (new_size == migrate_xbzrle_cache_size()) { 171 /* nothing to do */ 172 return 0; 173 } 174 175 XBZRLE_cache_lock(); 176 177 if (XBZRLE.cache != NULL) { 178 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 179 if (!new_cache) { 180 ret = -1; 181 goto out; 182 } 183 184 cache_fini(XBZRLE.cache); 185 XBZRLE.cache = new_cache; 186 } 187 out: 188 XBZRLE_cache_unlock(); 189 return ret; 190 } 191 192 static bool postcopy_preempt_active(void) 193 { 194 return migrate_postcopy_preempt() && migration_in_postcopy(); 195 } 196 197 bool ramblock_is_ignored(RAMBlock *block) 198 { 199 return !qemu_ram_is_migratable(block) || 200 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 201 } 202 203 #undef RAMBLOCK_FOREACH 204 205 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 206 { 207 RAMBlock *block; 208 int ret = 0; 209 210 RCU_READ_LOCK_GUARD(); 211 212 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 213 ret = func(block, opaque); 214 if (ret) { 215 break; 216 } 217 } 218 return ret; 219 } 220 221 static void ramblock_recv_map_init(void) 222 { 223 RAMBlock *rb; 224 225 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 226 assert(!rb->receivedmap); 227 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 228 } 229 } 230 231 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 232 { 233 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 234 rb->receivedmap); 235 } 236 237 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 238 { 239 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 240 } 241 242 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 243 { 244 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 245 } 246 247 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 248 size_t nr) 249 { 250 bitmap_set_atomic(rb->receivedmap, 251 ramblock_recv_bitmap_offset(host_addr, rb), 252 nr); 253 } 254 255 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 256 257 /* 258 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 259 * 260 * Returns >0 if success with sent bytes, or <0 if error. 261 */ 262 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 263 const char *block_name) 264 { 265 RAMBlock *block = qemu_ram_block_by_name(block_name); 266 unsigned long *le_bitmap, nbits; 267 uint64_t size; 268 269 if (!block) { 270 error_report("%s: invalid block name: %s", __func__, block_name); 271 return -1; 272 } 273 274 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 275 276 /* 277 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 278 * machines we may need 4 more bytes for padding (see below 279 * comment). So extend it a bit before hand. 280 */ 281 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 282 283 /* 284 * Always use little endian when sending the bitmap. This is 285 * required that when source and destination VMs are not using the 286 * same endianness. (Note: big endian won't work.) 
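 *
 * As a rough worked example (assuming 4 KiB target pages): a 1 GiB
 * RAMBlock has nbits == 262144, so the bitmap body is 32768 bytes,
 * already a multiple of 8, and the stream carries
 *   be64 size (32768) | 32768 bytes of LE bitmap | be64 ending marker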
287 */ 288 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 289 290 /* Size of the bitmap, in bytes */ 291 size = DIV_ROUND_UP(nbits, 8); 292 293 /* 294 * size is always aligned to 8 bytes for 64bit machines, but it 295 * may not be true for 32bit machines. We need this padding to 296 * make sure the migration can survive even between 32bit and 297 * 64bit machines. 298 */ 299 size = ROUND_UP(size, 8); 300 301 qemu_put_be64(file, size); 302 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 303 /* 304 * Mark as an end, in case the middle part is screwed up due to 305 * some "mysterious" reason. 306 */ 307 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 308 qemu_fflush(file); 309 310 g_free(le_bitmap); 311 312 if (qemu_file_get_error(file)) { 313 return qemu_file_get_error(file); 314 } 315 316 return size + sizeof(size); 317 } 318 319 /* 320 * An outstanding page request, on the source, having been received 321 * and queued 322 */ 323 struct RAMSrcPageRequest { 324 RAMBlock *rb; 325 hwaddr offset; 326 hwaddr len; 327 328 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 329 }; 330 331 /* State of RAM for migration */ 332 struct RAMState { 333 /* 334 * PageSearchStatus structures for the channels when send pages. 335 * Protected by the bitmap_mutex. 336 */ 337 PageSearchStatus pss[RAM_CHANNEL_MAX]; 338 /* UFFD file descriptor, used in 'write-tracking' migration */ 339 int uffdio_fd; 340 /* total ram size in bytes */ 341 uint64_t ram_bytes_total; 342 /* Last block that we have visited searching for dirty pages */ 343 RAMBlock *last_seen_block; 344 /* Last dirty target page we have sent */ 345 ram_addr_t last_page; 346 /* last ram version we have seen */ 347 uint32_t last_version; 348 /* How many times we have dirty too many pages */ 349 int dirty_rate_high_cnt; 350 /* these variables are used for bitmap sync */ 351 /* last time we did a full bitmap_sync */ 352 int64_t time_last_bitmap_sync; 353 /* bytes transferred at start_time */ 354 uint64_t bytes_xfer_prev; 355 /* number of dirty pages since start_time */ 356 uint64_t num_dirty_pages_period; 357 /* xbzrle misses since the beginning of the period */ 358 uint64_t xbzrle_cache_miss_prev; 359 /* Amount of xbzrle pages since the beginning of the period */ 360 uint64_t xbzrle_pages_prev; 361 /* Amount of xbzrle encoded bytes since the beginning of the period */ 362 uint64_t xbzrle_bytes_prev; 363 /* Are we really using XBZRLE (e.g., after the first round). 
*/ 364 bool xbzrle_started; 365 /* Are we on the last stage of migration */ 366 bool last_stage; 367 /* compression statistics since the beginning of the period */ 368 /* amount of count that no free thread to compress data */ 369 uint64_t compress_thread_busy_prev; 370 /* amount bytes after compression */ 371 uint64_t compressed_size_prev; 372 /* amount of compressed pages */ 373 uint64_t compress_pages_prev; 374 375 /* total handled target pages at the beginning of period */ 376 uint64_t target_page_count_prev; 377 /* total handled target pages since start */ 378 uint64_t target_page_count; 379 /* number of dirty bits in the bitmap */ 380 uint64_t migration_dirty_pages; 381 /* 382 * Protects: 383 * - dirty/clear bitmap 384 * - migration_dirty_pages 385 * - pss structures 386 */ 387 QemuMutex bitmap_mutex; 388 /* The RAMBlock used in the last src_page_requests */ 389 RAMBlock *last_req_rb; 390 /* Queue of outstanding page requests from the destination */ 391 QemuMutex src_page_req_mutex; 392 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 393 }; 394 typedef struct RAMState RAMState; 395 396 static RAMState *ram_state; 397 398 static NotifierWithReturnList precopy_notifier_list; 399 400 /* Whether postcopy has queued requests? */ 401 static bool postcopy_has_request(RAMState *rs) 402 { 403 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 404 } 405 406 void precopy_infrastructure_init(void) 407 { 408 notifier_with_return_list_init(&precopy_notifier_list); 409 } 410 411 void precopy_add_notifier(NotifierWithReturn *n) 412 { 413 notifier_with_return_list_add(&precopy_notifier_list, n); 414 } 415 416 void precopy_remove_notifier(NotifierWithReturn *n) 417 { 418 notifier_with_return_remove(n); 419 } 420 421 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 422 { 423 PrecopyNotifyData pnd; 424 pnd.reason = reason; 425 pnd.errp = errp; 426 427 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 428 } 429 430 uint64_t ram_bytes_remaining(void) 431 { 432 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 433 0; 434 } 435 436 void ram_transferred_add(uint64_t bytes) 437 { 438 if (runstate_is_running()) { 439 stat64_add(&mig_stats.precopy_bytes, bytes); 440 } else if (migration_in_postcopy()) { 441 stat64_add(&mig_stats.postcopy_bytes, bytes); 442 } else { 443 stat64_add(&mig_stats.downtime_bytes, bytes); 444 } 445 stat64_add(&mig_stats.transferred, bytes); 446 } 447 448 struct MigrationOps { 449 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 450 }; 451 typedef struct MigrationOps MigrationOps; 452 453 MigrationOps *migration_ops; 454 455 static int ram_save_host_page_urgent(PageSearchStatus *pss); 456 457 /* NOTE: page is the PFN not real ram_addr_t. */ 458 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 459 { 460 pss->block = rb; 461 pss->page = page; 462 pss->complete_round = false; 463 } 464 465 /* 466 * Check whether two PSSs are actively sending the same page. Return true 467 * if it is, false otherwise. 
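 * (This can happen, for instance, when the postcopy preempt channel gets
 * an urgent request for a host page that the precopy channel is already
 * in the middle of sending.)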
468 */ 469 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 470 { 471 return pss1->host_page_sending && pss2->host_page_sending && 472 (pss1->host_page_start == pss2->host_page_start); 473 } 474 475 /** 476 * save_page_header: write page header to wire 477 * 478 * If this is the 1st block, it also writes the block identification 479 * 480 * Returns the number of bytes written 481 * 482 * @pss: current PSS channel status 483 * @block: block that contains the page we want to send 484 * @offset: offset inside the block for the page 485 * in the lower bits, it contains flags 486 */ 487 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 488 RAMBlock *block, ram_addr_t offset) 489 { 490 size_t size, len; 491 bool same_block = (block == pss->last_sent_block); 492 493 if (same_block) { 494 offset |= RAM_SAVE_FLAG_CONTINUE; 495 } 496 qemu_put_be64(f, offset); 497 size = 8; 498 499 if (!same_block) { 500 len = strlen(block->idstr); 501 qemu_put_byte(f, len); 502 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 503 size += 1 + len; 504 pss->last_sent_block = block; 505 } 506 return size; 507 } 508 509 /** 510 * mig_throttle_guest_down: throttle down the guest 511 * 512 * Reduce amount of guest cpu execution to hopefully slow down memory 513 * writes. If guest dirty memory rate is reduced below the rate at 514 * which we can transfer pages to the destination then we should be 515 * able to complete migration. Some workloads dirty memory way too 516 * fast and will not effectively converge, even with auto-converge. 517 */ 518 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 519 uint64_t bytes_dirty_threshold) 520 { 521 uint64_t pct_initial = migrate_cpu_throttle_initial(); 522 uint64_t pct_increment = migrate_cpu_throttle_increment(); 523 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 524 int pct_max = migrate_max_cpu_throttle(); 525 526 uint64_t throttle_now = cpu_throttle_get_percentage(); 527 uint64_t cpu_now, cpu_ideal, throttle_inc; 528 529 /* We have not started throttling yet. Let's start it. */ 530 if (!cpu_throttle_active()) { 531 cpu_throttle_set(pct_initial); 532 } else { 533 /* Throttling already on, just increase the rate */ 534 if (!pct_tailslow) { 535 throttle_inc = pct_increment; 536 } else { 537 /* Compute the ideal CPU percentage used by Guest, which may 538 * make the dirty rate match the dirty rate threshold. */ 539 cpu_now = 100 - throttle_now; 540 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 541 bytes_dirty_period); 542 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 543 } 544 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 545 } 546 } 547 548 void mig_throttle_counter_reset(void) 549 { 550 RAMState *rs = ram_state; 551 552 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 553 rs->num_dirty_pages_period = 0; 554 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred); 555 } 556 557 /** 558 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 559 * 560 * @rs: current RAM state 561 * @current_addr: address for the zero page 562 * 563 * Update the xbzrle cache to reflect a page that's been sent as all 0. 564 * The important thing is that a stale (not-yet-0'd) page be replaced 565 * by the new data. 566 * As a bonus, if the page wasn't in the cache it gets added so that 567 * when a small write is made into the 0'd page it gets XBZRLE sent. 
568 */ 569 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 570 { 571 /* We don't care if this fails to allocate a new cache page 572 * as long as it updated an old one */ 573 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 574 stat64_get(&mig_stats.dirty_sync_count)); 575 } 576 577 #define ENCODING_FLAG_XBZRLE 0x1 578 579 /** 580 * save_xbzrle_page: compress and send current page 581 * 582 * Returns: 1 means that we wrote the page 583 * 0 means that page is identical to the one already sent 584 * -1 means that xbzrle would be longer than normal 585 * 586 * @rs: current RAM state 587 * @pss: current PSS channel 588 * @current_data: pointer to the address of the page contents 589 * @current_addr: addr of the page 590 * @block: block that contains the page we want to send 591 * @offset: offset inside the block for the page 592 */ 593 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 594 uint8_t **current_data, ram_addr_t current_addr, 595 RAMBlock *block, ram_addr_t offset) 596 { 597 int encoded_len = 0, bytes_xbzrle; 598 uint8_t *prev_cached_page; 599 QEMUFile *file = pss->pss_channel; 600 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 601 602 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 603 xbzrle_counters.cache_miss++; 604 if (!rs->last_stage) { 605 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 606 generation) == -1) { 607 return -1; 608 } else { 609 /* update *current_data when the page has been 610 inserted into cache */ 611 *current_data = get_cached_data(XBZRLE.cache, current_addr); 612 } 613 } 614 return -1; 615 } 616 617 /* 618 * Reaching here means the page has hit the xbzrle cache, no matter what 619 * encoding result it is (normal encoding, overflow or skipping the page), 620 * count the page as encoded. This is used to calculate the encoding rate. 621 * 622 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 623 * 2nd page turns out to be skipped (i.e. no new bytes written to the 624 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 625 * skipped page included. In this way, the encoding rate can tell if the 626 * guest page is good for xbzrle encoding. 627 */ 628 xbzrle_counters.pages++; 629 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 630 631 /* save current buffer into memory */ 632 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 633 634 /* XBZRLE encoding (if there is no overflow) */ 635 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 636 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 637 TARGET_PAGE_SIZE); 638 639 /* 640 * Update the cache contents, so that it corresponds to the data 641 * sent, in all cases except where we skip the page. 642 */ 643 if (!rs->last_stage && encoded_len != 0) { 644 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 645 /* 646 * In the case where we couldn't compress, ensure that the caller 647 * sends the data from the cache, since the guest might have 648 * changed the RAM since we copied it. 
649 */ 650 *current_data = prev_cached_page; 651 } 652 653 if (encoded_len == 0) { 654 trace_save_xbzrle_page_skipping(); 655 return 0; 656 } else if (encoded_len == -1) { 657 trace_save_xbzrle_page_overflow(); 658 xbzrle_counters.overflow++; 659 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 660 return -1; 661 } 662 663 /* Send XBZRLE based compressed page */ 664 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 665 offset | RAM_SAVE_FLAG_XBZRLE); 666 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 667 qemu_put_be16(file, encoded_len); 668 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 669 bytes_xbzrle += encoded_len + 1 + 2; 670 /* 671 * Like compressed_size (please see update_compress_thread_counts), 672 * the xbzrle encoded bytes don't count the 8 byte header with 673 * RAM_SAVE_FLAG_CONTINUE. 674 */ 675 xbzrle_counters.bytes += bytes_xbzrle - 8; 676 ram_transferred_add(bytes_xbzrle); 677 678 return 1; 679 } 680 681 /** 682 * pss_find_next_dirty: find the next dirty page of current ramblock 683 * 684 * This function updates pss->page to point to the next dirty page index 685 * within the ramblock to migrate, or the end of ramblock when nothing 686 * found. Note that when pss->host_page_sending==true it means we're 687 * during sending a host page, so we won't look for dirty page that is 688 * outside the host page boundary. 689 * 690 * @pss: the current page search status 691 */ 692 static void pss_find_next_dirty(PageSearchStatus *pss) 693 { 694 RAMBlock *rb = pss->block; 695 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 696 unsigned long *bitmap = rb->bmap; 697 698 if (ramblock_is_ignored(rb)) { 699 /* Points directly to the end, so we know no dirty page */ 700 pss->page = size; 701 return; 702 } 703 704 /* 705 * If during sending a host page, only look for dirty pages within the 706 * current host page being send. 707 */ 708 if (pss->host_page_sending) { 709 assert(pss->host_page_end); 710 size = MIN(size, pss->host_page_end); 711 } 712 713 pss->page = find_next_bit(bitmap, size, pss->page); 714 } 715 716 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 717 unsigned long page) 718 { 719 uint8_t shift; 720 hwaddr size, start; 721 722 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 723 return; 724 } 725 726 shift = rb->clear_bmap_shift; 727 /* 728 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 729 * can make things easier sometimes since then start address 730 * of the small chunk will always be 64 pages aligned so the 731 * bitmap will always be aligned to unsigned long. We should 732 * even be able to remove this restriction but I'm simply 733 * keeping it. 734 */ 735 assert(shift >= 6); 736 737 size = 1ULL << (TARGET_PAGE_BITS + shift); 738 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 739 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 740 memory_region_clear_dirty_bitmap(rb->mr, start, size); 741 } 742 743 static void 744 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 745 unsigned long start, 746 unsigned long npages) 747 { 748 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 749 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 750 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 751 752 /* 753 * Clear pages from start to start + npages - 1, so the end boundary is 754 * exclusive. 
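     *
     * For illustration (assuming 4 KiB target pages and the default
     * clear_bmap_shift of 18): each chunk then covers 2^18 pages, i.e.
     * 1 GiB of guest memory, so clearing pages [300000, 300010) rounds
     * out to the single chunk starting at page 262144 and issues at most
     * one memory_region_clear_dirty_bitmap() call for it.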
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
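 *
 * (A virtio-mem device is a typical provider of such a RamDiscardManager
 * for its memory region.)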
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
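 *
 * For example, a guest backed by ordinary 4 KiB pages plus one 2 MiB
 * hugetlbfs region would report 0x1000 | 0x200000 == 0x201000.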
913 */ 914 uint64_t ram_pagesize_summary(void) 915 { 916 RAMBlock *block; 917 uint64_t summary = 0; 918 919 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 920 summary |= block->page_size; 921 } 922 923 return summary; 924 } 925 926 uint64_t ram_get_total_transferred_pages(void) 927 { 928 return stat64_get(&mig_stats.normal_pages) + 929 stat64_get(&mig_stats.zero_pages) + 930 compression_counters.pages + xbzrle_counters.pages; 931 } 932 933 static void migration_update_rates(RAMState *rs, int64_t end_time) 934 { 935 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 936 double compressed_size; 937 938 /* calculate period counters */ 939 stat64_set(&mig_stats.dirty_pages_rate, 940 rs->num_dirty_pages_period * 1000 / 941 (end_time - rs->time_last_bitmap_sync)); 942 943 if (!page_count) { 944 return; 945 } 946 947 if (migrate_xbzrle()) { 948 double encoded_size, unencoded_size; 949 950 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 951 rs->xbzrle_cache_miss_prev) / page_count; 952 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 953 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 954 TARGET_PAGE_SIZE; 955 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 956 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 957 xbzrle_counters.encoding_rate = 0; 958 } else { 959 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 960 } 961 rs->xbzrle_pages_prev = xbzrle_counters.pages; 962 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 963 } 964 965 if (migrate_compress()) { 966 compression_counters.busy_rate = (double)(compression_counters.busy - 967 rs->compress_thread_busy_prev) / page_count; 968 rs->compress_thread_busy_prev = compression_counters.busy; 969 970 compressed_size = compression_counters.compressed_size - 971 rs->compressed_size_prev; 972 if (compressed_size) { 973 double uncompressed_size = (compression_counters.pages - 974 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 975 976 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 977 compression_counters.compression_rate = 978 uncompressed_size / compressed_size; 979 980 rs->compress_pages_prev = compression_counters.pages; 981 rs->compressed_size_prev = compression_counters.compressed_size; 982 } 983 } 984 } 985 986 static void migration_trigger_throttle(RAMState *rs) 987 { 988 uint64_t threshold = migrate_throttle_trigger_threshold(); 989 uint64_t bytes_xfer_period = 990 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev; 991 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 992 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 993 994 /* During block migration the auto-converge logic incorrectly detects 995 * that ram migration makes no progress. Avoid this by disabling the 996 * throttling logic during the bulk phase of block migration. */ 997 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 998 /* The following detection logic can be refined later. For now: 999 Check to see if the ratio between dirtied bytes and the approx. 1000 amount of bytes that just got transferred since the last time 1001 we were in this routine reaches the threshold. If that happens 1002 twice, start or increase throttling. 
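
       As a worked illustration with hypothetical numbers (and the default
       throttle-trigger-threshold of 50): if 1 GiB was transferred during
       the period, bytes_dirty_threshold is 512 MiB; dirtying more than
       that in two consecutive periods calls mig_throttle_guest_down().
       There, with the throttle currently at 20% and tailslow enabled,
       cpu_now = 80 and cpu_ideal = 80 * (512 MiB / 1 GiB) = 40, so the
       increment is capped at cpu-throttle-increment (10 by default) and
       the guest ends up throttled to 30%.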
*/ 1003 1004 if ((bytes_dirty_period > bytes_dirty_threshold) && 1005 (++rs->dirty_rate_high_cnt >= 2)) { 1006 trace_migration_throttle(); 1007 rs->dirty_rate_high_cnt = 0; 1008 mig_throttle_guest_down(bytes_dirty_period, 1009 bytes_dirty_threshold); 1010 } 1011 } 1012 } 1013 1014 static void migration_bitmap_sync(RAMState *rs, bool last_stage) 1015 { 1016 RAMBlock *block; 1017 int64_t end_time; 1018 1019 stat64_add(&mig_stats.dirty_sync_count, 1); 1020 1021 if (!rs->time_last_bitmap_sync) { 1022 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1023 } 1024 1025 trace_migration_bitmap_sync_start(); 1026 memory_global_dirty_log_sync(last_stage); 1027 1028 qemu_mutex_lock(&rs->bitmap_mutex); 1029 WITH_RCU_READ_LOCK_GUARD() { 1030 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1031 ramblock_sync_dirty_bitmap(rs, block); 1032 } 1033 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining()); 1034 } 1035 qemu_mutex_unlock(&rs->bitmap_mutex); 1036 1037 memory_global_after_dirty_log_sync(); 1038 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1039 1040 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1041 1042 /* more than 1 second = 1000 millisecons */ 1043 if (end_time > rs->time_last_bitmap_sync + 1000) { 1044 migration_trigger_throttle(rs); 1045 1046 migration_update_rates(rs, end_time); 1047 1048 rs->target_page_count_prev = rs->target_page_count; 1049 1050 /* reset period counters */ 1051 rs->time_last_bitmap_sync = end_time; 1052 rs->num_dirty_pages_period = 0; 1053 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred); 1054 } 1055 if (migrate_events()) { 1056 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 1057 qapi_event_send_migration_pass(generation); 1058 } 1059 } 1060 1061 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) 1062 { 1063 Error *local_err = NULL; 1064 1065 /* 1066 * The current notifier usage is just an optimization to migration, so we 1067 * don't stop the normal migration process in the error case. 1068 */ 1069 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1070 error_report_err(local_err); 1071 local_err = NULL; 1072 } 1073 1074 migration_bitmap_sync(rs, last_stage); 1075 1076 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1077 error_report_err(local_err); 1078 } 1079 } 1080 1081 void ram_release_page(const char *rbname, uint64_t offset) 1082 { 1083 if (!migrate_release_ram() || !migration_in_postcopy()) { 1084 return; 1085 } 1086 1087 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1088 } 1089 1090 /** 1091 * save_zero_page_to_file: send the zero page to the file 1092 * 1093 * Returns the size of data written to the file, 0 means the page is not 1094 * a zero page 1095 * 1096 * @pss: current PSS channel 1097 * @block: block that contains the page we want to send 1098 * @offset: offset inside the block for the page 1099 */ 1100 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, 1101 RAMBlock *block, ram_addr_t offset) 1102 { 1103 uint8_t *p = block->host + offset; 1104 int len = 0; 1105 1106 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1107 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1108 qemu_put_byte(file, 0); 1109 len += 1; 1110 ram_release_page(block->idstr, offset); 1111 } 1112 return len; 1113 } 1114 1115 /** 1116 * save_zero_page: send the zero page to the stream 1117 * 1118 * Returns the number of pages written. 
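 * (1 when the page was all zeroes: on the wire that is just the page
 * header from save_page_header() with RAM_SAVE_FLAG_ZERO set plus one
 * 0x00 byte; -1 when the page is not a zero page and nothing was sent.)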
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&mig_stats.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&mig_stats.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&mig_stats.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
1212 * 1213 * @rs: current RAM state 1214 * @block: block that contains the page we want to send 1215 * @offset: offset inside the block for the page 1216 */ 1217 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1218 { 1219 int pages = -1; 1220 uint8_t *p; 1221 bool send_async = true; 1222 RAMBlock *block = pss->block; 1223 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1224 ram_addr_t current_addr = block->offset + offset; 1225 1226 p = block->host + offset; 1227 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1228 1229 XBZRLE_cache_lock(); 1230 if (rs->xbzrle_started && !migration_in_postcopy()) { 1231 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1232 block, offset); 1233 if (!rs->last_stage) { 1234 /* Can't send this cached data async, since the cache page 1235 * might get updated before it gets to the wire 1236 */ 1237 send_async = false; 1238 } 1239 } 1240 1241 /* XBZRLE overflow or normal page */ 1242 if (pages == -1) { 1243 pages = save_normal_page(pss, block, offset, p, send_async); 1244 } 1245 1246 XBZRLE_cache_unlock(); 1247 1248 return pages; 1249 } 1250 1251 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, 1252 ram_addr_t offset) 1253 { 1254 if (multifd_queue_page(file, block, offset) < 0) { 1255 return -1; 1256 } 1257 stat64_add(&mig_stats.normal_pages, 1); 1258 1259 return 1; 1260 } 1261 1262 static void 1263 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1264 { 1265 ram_transferred_add(bytes_xmit); 1266 1267 if (param->result == RES_ZEROPAGE) { 1268 stat64_add(&mig_stats.zero_pages, 1); 1269 return; 1270 } 1271 1272 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1273 compression_counters.compressed_size += bytes_xmit - 8; 1274 compression_counters.pages++; 1275 } 1276 1277 static bool save_page_use_compression(RAMState *rs); 1278 1279 static int send_queued_data(CompressParam *param) 1280 { 1281 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY]; 1282 MigrationState *ms = migrate_get_current(); 1283 QEMUFile *file = ms->to_dst_file; 1284 int len = 0; 1285 1286 RAMBlock *block = param->block; 1287 ram_addr_t offset = param->offset; 1288 1289 if (param->result == RES_NONE) { 1290 return 0; 1291 } 1292 1293 assert(block == pss->last_sent_block); 1294 1295 if (param->result == RES_ZEROPAGE) { 1296 assert(qemu_file_buffer_empty(param->file)); 1297 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1298 qemu_put_byte(file, 0); 1299 len += 1; 1300 ram_release_page(block->idstr, offset); 1301 } else if (param->result == RES_COMPRESS) { 1302 assert(!qemu_file_buffer_empty(param->file)); 1303 len += save_page_header(pss, file, block, 1304 offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1305 len += qemu_put_qemu_file(file, param->file); 1306 } else { 1307 abort(); 1308 } 1309 1310 update_compress_thread_counts(param, len); 1311 1312 return len; 1313 } 1314 1315 static void ram_flush_compressed_data(RAMState *rs) 1316 { 1317 if (!save_page_use_compression(rs)) { 1318 return; 1319 } 1320 1321 flush_compressed_data(send_queued_data); 1322 } 1323 1324 #define PAGE_ALL_CLEAN 0 1325 #define PAGE_TRY_AGAIN 1 1326 #define PAGE_DIRTY_FOUND 2 1327 /** 1328 * find_dirty_block: find the next dirty page and update any state 1329 * associated with the search process. 
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            ram_flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
1421 */ 1422 assert(postcopy_has_request(rs)); 1423 1424 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1425 block = entry->rb; 1426 *offset = entry->offset; 1427 1428 if (entry->len > TARGET_PAGE_SIZE) { 1429 entry->len -= TARGET_PAGE_SIZE; 1430 entry->offset += TARGET_PAGE_SIZE; 1431 } else { 1432 memory_region_unref(block->mr); 1433 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1434 g_free(entry); 1435 migration_consume_urgent_request(); 1436 } 1437 1438 return block; 1439 } 1440 1441 #if defined(__linux__) 1442 /** 1443 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1444 * is found, return RAM block pointer and page offset 1445 * 1446 * Returns pointer to the RAMBlock containing faulting page, 1447 * NULL if no write faults are pending 1448 * 1449 * @rs: current RAM state 1450 * @offset: page offset from the beginning of the block 1451 */ 1452 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1453 { 1454 struct uffd_msg uffd_msg; 1455 void *page_address; 1456 RAMBlock *block; 1457 int res; 1458 1459 if (!migrate_background_snapshot()) { 1460 return NULL; 1461 } 1462 1463 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1464 if (res <= 0) { 1465 return NULL; 1466 } 1467 1468 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1469 block = qemu_ram_block_from_host(page_address, false, offset); 1470 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1471 return block; 1472 } 1473 1474 /** 1475 * ram_save_release_protection: release UFFD write protection after 1476 * a range of pages has been saved 1477 * 1478 * @rs: current RAM state 1479 * @pss: page-search-status structure 1480 * @start_page: index of the first page in the range relative to pss->block 1481 * 1482 * Returns 0 on success, negative value in case of an error 1483 */ 1484 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1485 unsigned long start_page) 1486 { 1487 int res = 0; 1488 1489 /* Check if page is from UFFD-managed region. */ 1490 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1491 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1492 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1493 1494 /* Flush async buffers before un-protect. */ 1495 qemu_fflush(pss->pss_channel); 1496 /* Un-protect memory range. 
*/ 1497 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1498 false, false); 1499 } 1500 1501 return res; 1502 } 1503 1504 /* ram_write_tracking_available: check if kernel supports required UFFD features 1505 * 1506 * Returns true if supports, false otherwise 1507 */ 1508 bool ram_write_tracking_available(void) 1509 { 1510 uint64_t uffd_features; 1511 int res; 1512 1513 res = uffd_query_features(&uffd_features); 1514 return (res == 0 && 1515 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1516 } 1517 1518 /* ram_write_tracking_compatible: check if guest configuration is 1519 * compatible with 'write-tracking' 1520 * 1521 * Returns true if compatible, false otherwise 1522 */ 1523 bool ram_write_tracking_compatible(void) 1524 { 1525 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1526 int uffd_fd; 1527 RAMBlock *block; 1528 bool ret = false; 1529 1530 /* Open UFFD file descriptor */ 1531 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1532 if (uffd_fd < 0) { 1533 return false; 1534 } 1535 1536 RCU_READ_LOCK_GUARD(); 1537 1538 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1539 uint64_t uffd_ioctls; 1540 1541 /* Nothing to do with read-only and MMIO-writable regions */ 1542 if (block->mr->readonly || block->mr->rom_device) { 1543 continue; 1544 } 1545 /* Try to register block memory via UFFD-IO to track writes */ 1546 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1547 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1548 goto out; 1549 } 1550 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1551 goto out; 1552 } 1553 } 1554 ret = true; 1555 1556 out: 1557 uffd_close_fd(uffd_fd); 1558 return ret; 1559 } 1560 1561 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1562 ram_addr_t size) 1563 { 1564 const ram_addr_t end = offset + size; 1565 1566 /* 1567 * We read one byte of each page; this will preallocate page tables if 1568 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1569 * where no page was populated yet. This might require adaption when 1570 * supporting other mappings, like shmem. 1571 */ 1572 for (; offset < end; offset += block->page_size) { 1573 char tmp = *((char *)block->host + offset); 1574 1575 /* Don't optimize the read out */ 1576 asm volatile("" : "+r" (tmp)); 1577 } 1578 } 1579 1580 static inline int populate_read_section(MemoryRegionSection *section, 1581 void *opaque) 1582 { 1583 const hwaddr size = int128_get64(section->size); 1584 hwaddr offset = section->offset_within_region; 1585 RAMBlock *block = section->mr->ram_block; 1586 1587 populate_read_range(block, offset, size); 1588 return 0; 1589 } 1590 1591 /* 1592 * ram_block_populate_read: preallocate page tables and populate pages in the 1593 * RAM block by reading a byte of each page. 1594 * 1595 * Since it's solely used for userfault_fd WP feature, here we just 1596 * hardcode page size to qemu_real_host_page_size. 1597 * 1598 * @block: RAM block to populate 1599 */ 1600 static void ram_block_populate_read(RAMBlock *rb) 1601 { 1602 /* 1603 * Skip populating all pages that fall into a discarded range as managed by 1604 * a RamDiscardManager responsible for the mapped memory region of the 1605 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1606 * must not get populated automatically. 
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1723 if (ram_block_uffd_protect(block, uffd_fd)) { 1724 goto fail; 1725 } 1726 1727 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1728 block->host, block->max_length); 1729 } 1730 1731 return 0; 1732 1733 fail: 1734 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1735 1736 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1737 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1738 continue; 1739 } 1740 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1741 /* Cleanup flags and remove reference */ 1742 block->flags &= ~RAM_UF_WRITEPROTECT; 1743 memory_region_unref(block->mr); 1744 } 1745 1746 uffd_close_fd(uffd_fd); 1747 rs->uffdio_fd = -1; 1748 return -1; 1749 } 1750 1751 /** 1752 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1753 */ 1754 void ram_write_tracking_stop(void) 1755 { 1756 RAMState *rs = ram_state; 1757 RAMBlock *block; 1758 1759 RCU_READ_LOCK_GUARD(); 1760 1761 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1762 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1763 continue; 1764 } 1765 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1766 1767 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1768 block->host, block->max_length); 1769 1770 /* Cleanup flags and remove reference */ 1771 block->flags &= ~RAM_UF_WRITEPROTECT; 1772 memory_region_unref(block->mr); 1773 } 1774 1775 /* Finally close UFFD file descriptor */ 1776 uffd_close_fd(rs->uffdio_fd); 1777 rs->uffdio_fd = -1; 1778 } 1779 1780 #else 1781 /* No target OS support, stubs just fail or ignore */ 1782 1783 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1784 { 1785 (void) rs; 1786 (void) offset; 1787 1788 return NULL; 1789 } 1790 1791 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1792 unsigned long start_page) 1793 { 1794 (void) rs; 1795 (void) pss; 1796 (void) start_page; 1797 1798 return 0; 1799 } 1800 1801 bool ram_write_tracking_available(void) 1802 { 1803 return false; 1804 } 1805 1806 bool ram_write_tracking_compatible(void) 1807 { 1808 assert(0); 1809 return false; 1810 } 1811 1812 int ram_write_tracking_start(void) 1813 { 1814 assert(0); 1815 return -1; 1816 } 1817 1818 void ram_write_tracking_stop(void) 1819 { 1820 assert(0); 1821 } 1822 #endif /* defined(__linux__) */ 1823 1824 /** 1825 * get_queued_page: unqueue a page from the postcopy requests 1826 * 1827 * Skips pages that are already sent (!dirty) 1828 * 1829 * Returns true if a queued page is found 1830 * 1831 * @rs: current RAM state 1832 * @pss: data about the state of the current dirty page scan 1833 */ 1834 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1835 { 1836 RAMBlock *block; 1837 ram_addr_t offset; 1838 bool dirty; 1839 1840 do { 1841 block = unqueue_page(rs, &offset); 1842 /* 1843 * We're sending this page, and since it's postcopy nothing else 1844 * will dirty it, and we must make sure it doesn't get sent again 1845 * even if this queue request was received after the background 1846 * search already sent it. 
1847 */ 1848 if (block) { 1849 unsigned long page; 1850 1851 page = offset >> TARGET_PAGE_BITS; 1852 dirty = test_bit(page, block->bmap); 1853 if (!dirty) { 1854 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1855 page); 1856 } else { 1857 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1858 } 1859 } 1860 1861 } while (block && !dirty); 1862 1863 if (!block) { 1864 /* 1865 * Poll write faults too if background snapshot is enabled; that's 1866 * when we have vCPUs blocked by write-protected pages. 1867 */ 1868 block = poll_fault_page(rs, &offset); 1869 } 1870 1871 if (block) { 1872 /* 1873 * We want the background search to continue from the queued page 1874 * since the guest is likely to want other pages near to the page 1875 * it just requested. 1876 */ 1877 pss->block = block; 1878 pss->page = offset >> TARGET_PAGE_BITS; 1879 1880 /* 1881 * This unqueued page would break the "one round" check, even if it is 1882 * really rare. 1883 */ 1884 pss->complete_round = false; 1885 } 1886 1887 return !!block; 1888 } 1889 1890 /** 1891 * migration_page_queue_free: drop any remaining pages in the ram 1892 * request queue 1893 * 1894 * It should be empty at the end anyway, but in error cases there may 1895 * be some left. In case any page is left, we drop it. 1896 * 1897 */ 1898 static void migration_page_queue_free(RAMState *rs) 1899 { 1900 struct RAMSrcPageRequest *mspr, *next_mspr; 1901 /* This queue generally should be empty - but in the case of a failed 1902 * migration might have some droppings in. 1903 */ 1904 RCU_READ_LOCK_GUARD(); 1905 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1906 memory_region_unref(mspr->rb->mr); 1907 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1908 g_free(mspr); 1909 } 1910 } 1911 1912 /** 1913 * ram_save_queue_pages: queue the page for transmission 1914 * 1915 * A request from the postcopy destination, for example. 1916 * 1917 * Returns zero on success or negative on error 1918 * 1919 * @rbname: Name of the RAMBlock of the request. NULL means the 1920 * same as the last one. 1921 * @start: starting address from the start of the RAMBlock 1922 * @len: length (in bytes) to send 1923 */ 1924 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1925 { 1926 RAMBlock *ramblock; 1927 RAMState *rs = ram_state; 1928 1929 stat64_add(&mig_stats.postcopy_requests, 1); 1930 RCU_READ_LOCK_GUARD(); 1931 1932 if (!rbname) { 1933 /* Reuse last RAMBlock */ 1934 ramblock = rs->last_req_rb; 1935 1936 if (!ramblock) { 1937 /* 1938 * Shouldn't happen, we can't reuse the last RAMBlock if 1939 * it's the 1st request. 1940 */ 1941 error_report("ram_save_queue_pages no previous block"); 1942 return -1; 1943 } 1944 } else { 1945 ramblock = qemu_ram_block_by_name(rbname); 1946 1947 if (!ramblock) { 1948 /* We shouldn't be asked for a non-existent RAMBlock */ 1949 error_report("ram_save_queue_pages no block '%s'", rbname); 1950 return -1; 1951 } 1952 rs->last_req_rb = ramblock; 1953 } 1954 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1955 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1956 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1957 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1958 __func__, start, len, ramblock->used_length); 1959 return -1; 1960 } 1961 1962 /* 1963 * With postcopy preempt, we send the page back directly in the 1964 * rp-return thread. 
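 *
 * Roughly, the two paths below are:
 *  - preempt active: push the whole host page out immediately on the
 *    dedicated postcopy channel via ram_save_host_page_urgent();
 *  - otherwise: append a RAMSrcPageRequest to src_page_requests and
 *    wake the migration thread with migration_make_urgent_request().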
1965 */ 1966 if (postcopy_preempt_active()) { 1967 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1968 size_t page_size = qemu_ram_pagesize(ramblock); 1969 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1970 int ret = 0; 1971 1972 qemu_mutex_lock(&rs->bitmap_mutex); 1973 1974 pss_init(pss, ramblock, page_start); 1975 /* 1976 * Always use the preempt channel, and make sure it's there. It's 1977 * safe to access without lock, because when rp-thread is running 1978 * we should be the only one who operates on the qemufile 1979 */ 1980 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1981 assert(pss->pss_channel); 1982 1983 /* 1984 * It must be either one or multiple of host page size. Just 1985 * assert; if something wrong we're mostly split brain anyway. 1986 */ 1987 assert(len % page_size == 0); 1988 while (len) { 1989 if (ram_save_host_page_urgent(pss)) { 1990 error_report("%s: ram_save_host_page_urgent() failed: " 1991 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1992 __func__, ramblock->idstr, start); 1993 ret = -1; 1994 break; 1995 } 1996 /* 1997 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1998 * will automatically be moved and point to the next host page 1999 * we're going to send, so no need to update here. 2000 * 2001 * Normally QEMU never sends >1 host page in requests, so 2002 * logically we don't even need that as the loop should only 2003 * run once, but just to be consistent. 2004 */ 2005 len -= page_size; 2006 }; 2007 qemu_mutex_unlock(&rs->bitmap_mutex); 2008 2009 return ret; 2010 } 2011 2012 struct RAMSrcPageRequest *new_entry = 2013 g_new0(struct RAMSrcPageRequest, 1); 2014 new_entry->rb = ramblock; 2015 new_entry->offset = start; 2016 new_entry->len = len; 2017 2018 memory_region_ref(ramblock->mr); 2019 qemu_mutex_lock(&rs->src_page_req_mutex); 2020 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2021 migration_make_urgent_request(); 2022 qemu_mutex_unlock(&rs->src_page_req_mutex); 2023 2024 return 0; 2025 } 2026 2027 static bool save_page_use_compression(RAMState *rs) 2028 { 2029 if (!migrate_compress()) { 2030 return false; 2031 } 2032 2033 /* 2034 * If xbzrle is enabled (e.g., after first round of migration), stop 2035 * using the data compression. In theory, xbzrle can do better than 2036 * compression. 2037 */ 2038 if (rs->xbzrle_started) { 2039 return false; 2040 } 2041 2042 return true; 2043 } 2044 2045 /* 2046 * try to compress the page before posting it out, return true if the page 2047 * has been properly handled by compression, otherwise needs other 2048 * paths to handle it 2049 */ 2050 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2051 RAMBlock *block, ram_addr_t offset) 2052 { 2053 if (!save_page_use_compression(rs)) { 2054 return false; 2055 } 2056 2057 /* 2058 * When starting the process of a new block, the first page of 2059 * the block should be sent out before other pages in the same 2060 * block, and all the pages in last block should have been sent 2061 * out, keeping this order is important, because the 'cont' flag 2062 * is used to avoid resending the block name. 2063 * 2064 * We post the fist page as normal page as compression will take 2065 * much CPU resource. 
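 *
 * (Hence the flush below when switching blocks: any compressed pages
 * still queued for the previous block must reach the stream before the
 * first page of the new block goes out uncompressed.)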
2066 */ 2067 if (block != pss->last_sent_block) { 2068 ram_flush_compressed_data(rs); 2069 return false; 2070 } 2071 2072 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) { 2073 return true; 2074 } 2075 2076 compression_counters.busy++; 2077 return false; 2078 } 2079 2080 /** 2081 * ram_save_target_page_legacy: save one target page 2082 * 2083 * Returns the number of pages written 2084 * 2085 * @rs: current RAM state 2086 * @pss: data about the page we want to send 2087 */ 2088 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2089 { 2090 RAMBlock *block = pss->block; 2091 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2092 int res; 2093 2094 if (control_save_page(pss, block, offset, &res)) { 2095 return res; 2096 } 2097 2098 if (save_compress_page(rs, pss, block, offset)) { 2099 return 1; 2100 } 2101 2102 res = save_zero_page(pss, pss->pss_channel, block, offset); 2103 if (res > 0) { 2104 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2105 * page would be stale 2106 */ 2107 if (rs->xbzrle_started) { 2108 XBZRLE_cache_lock(); 2109 xbzrle_cache_zero_page(rs, block->offset + offset); 2110 XBZRLE_cache_unlock(); 2111 } 2112 return res; 2113 } 2114 2115 /* 2116 * Do not use multifd in postcopy as one whole host page should be 2117 * placed. Meanwhile postcopy requires atomic update of pages, so even 2118 * if host page size == guest page size the dest guest during run may 2119 * still see partially copied pages which is data corruption. 2120 */ 2121 if (migrate_multifd() && !migration_in_postcopy()) { 2122 return ram_save_multifd_page(pss->pss_channel, block, offset); 2123 } 2124 2125 return ram_save_page(rs, pss); 2126 } 2127 2128 /* Should be called before sending a host page */ 2129 static void pss_host_page_prepare(PageSearchStatus *pss) 2130 { 2131 /* How many guest pages are there in one host page? */ 2132 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2133 2134 pss->host_page_sending = true; 2135 if (guest_pfns <= 1) { 2136 /* 2137 * This covers both when guest psize == host psize, or when guest 2138 * has larger psize than the host (guest_pfns==0). 2139 * 2140 * For the latter, we always send one whole guest page per 2141 * iteration of the host page (example: an Alpha VM on x86 host 2142 * will have guest psize 8K while host psize 4K). 2143 */ 2144 pss->host_page_start = pss->page; 2145 pss->host_page_end = pss->page + 1; 2146 } else { 2147 /* 2148 * The host page spans over multiple guest pages, we send them 2149 * within the same host page iteration. 2150 */ 2151 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2152 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2153 } 2154 } 2155 2156 /* 2157 * Whether the page pointed by PSS is within the host page being sent. 2158 * Must be called after a previous pss_host_page_prepare(). 2159 */ 2160 static bool pss_within_range(PageSearchStatus *pss) 2161 { 2162 ram_addr_t ram_addr; 2163 2164 assert(pss->host_page_sending); 2165 2166 /* Over host-page boundary? 
*/ 2167 if (pss->page >= pss->host_page_end) { 2168 return false; 2169 } 2170 2171 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2172 2173 return offset_in_ramblock(pss->block, ram_addr); 2174 } 2175 2176 static void pss_host_page_finish(PageSearchStatus *pss) 2177 { 2178 pss->host_page_sending = false; 2179 /* This is not needed, but just to reset it */ 2180 pss->host_page_start = pss->host_page_end = 0; 2181 } 2182 2183 /* 2184 * Send an urgent host page specified by `pss'. Need to be called with 2185 * bitmap_mutex held. 2186 * 2187 * Returns 0 if save host page succeeded, false otherwise. 2188 */ 2189 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2190 { 2191 bool page_dirty, sent = false; 2192 RAMState *rs = ram_state; 2193 int ret = 0; 2194 2195 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2196 pss_host_page_prepare(pss); 2197 2198 /* 2199 * If precopy is sending the same page, let it be done in precopy, or 2200 * we could send the same page in two channels and none of them will 2201 * receive the whole page. 2202 */ 2203 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2204 trace_postcopy_preempt_hit(pss->block->idstr, 2205 pss->page << TARGET_PAGE_BITS); 2206 return 0; 2207 } 2208 2209 do { 2210 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2211 2212 if (page_dirty) { 2213 /* Be strict to return code; it must be 1, or what else? */ 2214 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2215 error_report_once("%s: ram_save_target_page failed", __func__); 2216 ret = -1; 2217 goto out; 2218 } 2219 sent = true; 2220 } 2221 pss_find_next_dirty(pss); 2222 } while (pss_within_range(pss)); 2223 out: 2224 pss_host_page_finish(pss); 2225 /* For urgent requests, flush immediately if sent */ 2226 if (sent) { 2227 qemu_fflush(pss->pss_channel); 2228 } 2229 return ret; 2230 } 2231 2232 /** 2233 * ram_save_host_page: save a whole host page 2234 * 2235 * Starting at *offset send pages up to the end of the current host 2236 * page. It's valid for the initial offset to point into the middle of 2237 * a host page in which case the remainder of the hostpage is sent. 2238 * Only dirty target pages are sent. Note that the host page size may 2239 * be a huge page for this block. 2240 * 2241 * The saving stops at the boundary of the used_length of the block 2242 * if the RAMBlock isn't a multiple of the host page size. 2243 * 2244 * The caller must be with ram_state.bitmap_mutex held to call this 2245 * function. Note that this function can temporarily release the lock, but 2246 * when the function is returned it'll make sure the lock is still held. 
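 * (In practice the lock is only dropped around ram_save_target_page()
 * when postcopy preempt is active, so that the return-path thread can
 * take the bitmap lock for urgent page requests in the meantime.)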
2247 * 2248 * Returns the number of pages written or negative on error 2249 * 2250 * @rs: current RAM state 2251 * @pss: data about the page we want to send 2252 */ 2253 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2254 { 2255 bool page_dirty, preempt_active = postcopy_preempt_active(); 2256 int tmppages, pages = 0; 2257 size_t pagesize_bits = 2258 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2259 unsigned long start_page = pss->page; 2260 int res; 2261 2262 if (ramblock_is_ignored(pss->block)) { 2263 error_report("block %s should not be migrated !", pss->block->idstr); 2264 return 0; 2265 } 2266 2267 /* Update host page boundary information */ 2268 pss_host_page_prepare(pss); 2269 2270 do { 2271 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2272 2273 /* Check the pages is dirty and if it is send it */ 2274 if (page_dirty) { 2275 /* 2276 * Properly yield the lock only in postcopy preempt mode 2277 * because both migration thread and rp-return thread can 2278 * operate on the bitmaps. 2279 */ 2280 if (preempt_active) { 2281 qemu_mutex_unlock(&rs->bitmap_mutex); 2282 } 2283 tmppages = migration_ops->ram_save_target_page(rs, pss); 2284 if (tmppages >= 0) { 2285 pages += tmppages; 2286 /* 2287 * Allow rate limiting to happen in the middle of huge pages if 2288 * something is sent in the current iteration. 2289 */ 2290 if (pagesize_bits > 1 && tmppages > 0) { 2291 migration_rate_limit(); 2292 } 2293 } 2294 if (preempt_active) { 2295 qemu_mutex_lock(&rs->bitmap_mutex); 2296 } 2297 } else { 2298 tmppages = 0; 2299 } 2300 2301 if (tmppages < 0) { 2302 pss_host_page_finish(pss); 2303 return tmppages; 2304 } 2305 2306 pss_find_next_dirty(pss); 2307 } while (pss_within_range(pss)); 2308 2309 pss_host_page_finish(pss); 2310 2311 res = ram_save_release_protection(rs, pss, start_page); 2312 return (res < 0 ? res : pages); 2313 } 2314 2315 /** 2316 * ram_find_and_save_block: finds a dirty page and sends it to f 2317 * 2318 * Called within an RCU critical section. 2319 * 2320 * Returns the number of pages written where zero means no dirty pages, 2321 * or negative on error 2322 * 2323 * @rs: current RAM state 2324 * 2325 * On systems where host-page-size > target-page-size it will send all the 2326 * pages in a host page that are dirty. 2327 */ 2328 static int ram_find_and_save_block(RAMState *rs) 2329 { 2330 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2331 int pages = 0; 2332 2333 /* No dirty page as there is zero RAM */ 2334 if (!rs->ram_bytes_total) { 2335 return pages; 2336 } 2337 2338 /* 2339 * Always keep last_seen_block/last_page valid during this procedure, 2340 * because find_dirty_block() relies on these values (e.g., we compare 2341 * last_seen_block with pss.block to see whether we searched all the 2342 * ramblocks) to detect the completion of migration. Having NULL value 2343 * of last_seen_block can conditionally cause below loop to run forever. 
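 *
 * (The loop below ends either when find_dirty_block() reports
 * PAGE_ALL_CLEAN after a complete pass over the ramblocks, when a host
 * page is actually sent, or on error.)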
2344 */ 2345 if (!rs->last_seen_block) { 2346 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2347 rs->last_page = 0; 2348 } 2349 2350 pss_init(pss, rs->last_seen_block, rs->last_page); 2351 2352 while (true){ 2353 if (!get_queued_page(rs, pss)) { 2354 /* priority queue empty, so just search for something dirty */ 2355 int res = find_dirty_block(rs, pss); 2356 if (res != PAGE_DIRTY_FOUND) { 2357 if (res == PAGE_ALL_CLEAN) { 2358 break; 2359 } else if (res == PAGE_TRY_AGAIN) { 2360 continue; 2361 } else if (res < 0) { 2362 pages = res; 2363 break; 2364 } 2365 } 2366 } 2367 pages = ram_save_host_page(rs, pss); 2368 if (pages) { 2369 break; 2370 } 2371 } 2372 2373 rs->last_seen_block = pss->block; 2374 rs->last_page = pss->page; 2375 2376 return pages; 2377 } 2378 2379 static uint64_t ram_bytes_total_with_ignored(void) 2380 { 2381 RAMBlock *block; 2382 uint64_t total = 0; 2383 2384 RCU_READ_LOCK_GUARD(); 2385 2386 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2387 total += block->used_length; 2388 } 2389 return total; 2390 } 2391 2392 uint64_t ram_bytes_total(void) 2393 { 2394 RAMBlock *block; 2395 uint64_t total = 0; 2396 2397 RCU_READ_LOCK_GUARD(); 2398 2399 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2400 total += block->used_length; 2401 } 2402 return total; 2403 } 2404 2405 static void xbzrle_load_setup(void) 2406 { 2407 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2408 } 2409 2410 static void xbzrle_load_cleanup(void) 2411 { 2412 g_free(XBZRLE.decoded_buf); 2413 XBZRLE.decoded_buf = NULL; 2414 } 2415 2416 static void ram_state_cleanup(RAMState **rsp) 2417 { 2418 if (*rsp) { 2419 migration_page_queue_free(*rsp); 2420 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2421 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2422 g_free(*rsp); 2423 *rsp = NULL; 2424 } 2425 } 2426 2427 static void xbzrle_cleanup(void) 2428 { 2429 XBZRLE_cache_lock(); 2430 if (XBZRLE.cache) { 2431 cache_fini(XBZRLE.cache); 2432 g_free(XBZRLE.encoded_buf); 2433 g_free(XBZRLE.current_buf); 2434 g_free(XBZRLE.zero_target_page); 2435 XBZRLE.cache = NULL; 2436 XBZRLE.encoded_buf = NULL; 2437 XBZRLE.current_buf = NULL; 2438 XBZRLE.zero_target_page = NULL; 2439 } 2440 XBZRLE_cache_unlock(); 2441 } 2442 2443 static void ram_save_cleanup(void *opaque) 2444 { 2445 RAMState **rsp = opaque; 2446 RAMBlock *block; 2447 2448 /* We don't use dirty log with background snapshots */ 2449 if (!migrate_background_snapshot()) { 2450 /* caller have hold iothread lock or is in a bh, so there is 2451 * no writing race against the migration bitmap 2452 */ 2453 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2454 /* 2455 * do not stop dirty log without starting it, since 2456 * memory_global_dirty_log_stop will assert that 2457 * memory_global_dirty_log_start/stop used in pairs 2458 */ 2459 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2460 } 2461 } 2462 2463 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2464 g_free(block->clear_bmap); 2465 block->clear_bmap = NULL; 2466 g_free(block->bmap); 2467 block->bmap = NULL; 2468 } 2469 2470 xbzrle_cleanup(); 2471 compress_threads_save_cleanup(); 2472 ram_state_cleanup(rsp); 2473 g_free(migration_ops); 2474 migration_ops = NULL; 2475 } 2476 2477 static void ram_state_reset(RAMState *rs) 2478 { 2479 int i; 2480 2481 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2482 rs->pss[i].last_sent_block = NULL; 2483 } 2484 2485 rs->last_seen_block = NULL; 2486 rs->last_page = 0; 2487 rs->last_version = ram_list.version; 2488 rs->xbzrle_started = false; 2489 } 2490 2491 #define MAX_WAIT 50 /* ms, half buffered_file 
limit */ 2492 2493 /* **** functions for postcopy ***** */ 2494 2495 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2496 { 2497 struct RAMBlock *block; 2498 2499 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2500 unsigned long *bitmap = block->bmap; 2501 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2502 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2503 2504 while (run_start < range) { 2505 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2506 ram_discard_range(block->idstr, 2507 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2508 ((ram_addr_t)(run_end - run_start)) 2509 << TARGET_PAGE_BITS); 2510 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2511 } 2512 } 2513 } 2514 2515 /** 2516 * postcopy_send_discard_bm_ram: discard a RAMBlock 2517 * 2518 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2519 * 2520 * @ms: current migration state 2521 * @block: RAMBlock to discard 2522 */ 2523 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2524 { 2525 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2526 unsigned long current; 2527 unsigned long *bitmap = block->bmap; 2528 2529 for (current = 0; current < end; ) { 2530 unsigned long one = find_next_bit(bitmap, end, current); 2531 unsigned long zero, discard_length; 2532 2533 if (one >= end) { 2534 break; 2535 } 2536 2537 zero = find_next_zero_bit(bitmap, end, one + 1); 2538 2539 if (zero >= end) { 2540 discard_length = end - one; 2541 } else { 2542 discard_length = zero - one; 2543 } 2544 postcopy_discard_send_range(ms, one, discard_length); 2545 current = one + discard_length; 2546 } 2547 } 2548 2549 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2550 2551 /** 2552 * postcopy_each_ram_send_discard: discard all RAMBlocks 2553 * 2554 * Utility for the outgoing postcopy code. 2555 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2556 * passing it bitmap indexes and name. 2557 * (qemu_ram_foreach_block ends up passing unscaled lengths 2558 * which would mean postcopy code would have to deal with target page) 2559 * 2560 * @ms: current migration state 2561 */ 2562 static void postcopy_each_ram_send_discard(MigrationState *ms) 2563 { 2564 struct RAMBlock *block; 2565 2566 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2567 postcopy_discard_send_init(ms, block->idstr); 2568 2569 /* 2570 * Deal with TPS != HPS and huge pages. It discard any partially sent 2571 * host-page size chunks, mark any partially dirty host-page size 2572 * chunks as all dirty. In this case the host-page is the host-page 2573 * for the particular RAMBlock, i.e. it might be a huge page. 2574 */ 2575 postcopy_chunk_hostpages_pass(ms, block); 2576 2577 /* 2578 * Postcopy sends chunks of bitmap over the wire, but it 2579 * just needs indexes at this point, avoids it having 2580 * target page specific code. 2581 */ 2582 postcopy_send_discard_bm_ram(ms, block); 2583 postcopy_discard_send_finish(ms); 2584 } 2585 } 2586 2587 /** 2588 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2589 * 2590 * Helper for postcopy_chunk_hostpages; it's called twice to 2591 * canonicalize the two bitmaps, that are similar, but one is 2592 * inverted. 2593 * 2594 * Postcopy requires that all target pages in a hostpage are dirty or 2595 * clean, not a mix. This function canonicalizes the bitmaps. 
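 *
 * For example (assuming 2MiB host huge pages and 4KiB target pages,
 * i.e. host_ratio == 512): if only some of the 512 bits covering one
 * huge page are set, the whole run of 512 bits is forced to 1, so the
 * entire huge page gets discarded on the destination and resent.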
2596 * 2597 * @ms: current migration state 2598 * @block: block that contains the page we want to canonicalize 2599 */ 2600 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2601 { 2602 RAMState *rs = ram_state; 2603 unsigned long *bitmap = block->bmap; 2604 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2605 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2606 unsigned long run_start; 2607 2608 if (block->page_size == TARGET_PAGE_SIZE) { 2609 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2610 return; 2611 } 2612 2613 /* Find a dirty page */ 2614 run_start = find_next_bit(bitmap, pages, 0); 2615 2616 while (run_start < pages) { 2617 2618 /* 2619 * If the start of this run of pages is in the middle of a host 2620 * page, then we need to fixup this host page. 2621 */ 2622 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2623 /* Find the end of this run */ 2624 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2625 /* 2626 * If the end isn't at the start of a host page, then the 2627 * run doesn't finish at the end of a host page 2628 * and we need to discard. 2629 */ 2630 } 2631 2632 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2633 unsigned long page; 2634 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2635 host_ratio); 2636 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2637 2638 /* Clean up the bitmap */ 2639 for (page = fixup_start_addr; 2640 page < fixup_start_addr + host_ratio; page++) { 2641 /* 2642 * Remark them as dirty, updating the count for any pages 2643 * that weren't previously dirty. 2644 */ 2645 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2646 } 2647 } 2648 2649 /* Find the next dirty page for the next iteration */ 2650 run_start = find_next_bit(bitmap, pages, run_start); 2651 } 2652 } 2653 2654 /** 2655 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2656 * 2657 * Transmit the set of pages to be discarded after precopy to the target 2658 * these are pages that: 2659 * a) Have been previously transmitted but are now dirty again 2660 * b) Pages that have never been transmitted, this ensures that 2661 * any pages on the destination that have been mapped by background 2662 * tasks get discarded (transparent huge pages is the specific concern) 2663 * Hopefully this is pretty sparse 2664 * 2665 * @ms: current migration state 2666 */ 2667 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2668 { 2669 RAMState *rs = ram_state; 2670 2671 RCU_READ_LOCK_GUARD(); 2672 2673 /* This should be our last sync, the src is now paused */ 2674 migration_bitmap_sync(rs, false); 2675 2676 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2677 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2678 rs->last_seen_block = NULL; 2679 rs->last_page = 0; 2680 2681 postcopy_each_ram_send_discard(ms); 2682 2683 trace_ram_postcopy_send_discard_bitmap(); 2684 } 2685 2686 /** 2687 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2688 * 2689 * Returns zero on success 2690 * 2691 * @rbname: name of the RAMBlock of the request. NULL means the 2692 * same that last one. 
2693 * @start: RAMBlock starting page 2694 * @length: RAMBlock size 2695 */ 2696 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2697 { 2698 trace_ram_discard_range(rbname, start, length); 2699 2700 RCU_READ_LOCK_GUARD(); 2701 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2702 2703 if (!rb) { 2704 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2705 return -1; 2706 } 2707 2708 /* 2709 * On source VM, we don't need to update the received bitmap since 2710 * we don't even have one. 2711 */ 2712 if (rb->receivedmap) { 2713 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2714 length >> qemu_target_page_bits()); 2715 } 2716 2717 return ram_block_discard_range(rb, start, length); 2718 } 2719 2720 /* 2721 * For every allocation, we will try not to crash the VM if the 2722 * allocation failed. 2723 */ 2724 static int xbzrle_init(void) 2725 { 2726 Error *local_err = NULL; 2727 2728 if (!migrate_xbzrle()) { 2729 return 0; 2730 } 2731 2732 XBZRLE_cache_lock(); 2733 2734 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2735 if (!XBZRLE.zero_target_page) { 2736 error_report("%s: Error allocating zero page", __func__); 2737 goto err_out; 2738 } 2739 2740 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2741 TARGET_PAGE_SIZE, &local_err); 2742 if (!XBZRLE.cache) { 2743 error_report_err(local_err); 2744 goto free_zero_page; 2745 } 2746 2747 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2748 if (!XBZRLE.encoded_buf) { 2749 error_report("%s: Error allocating encoded_buf", __func__); 2750 goto free_cache; 2751 } 2752 2753 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2754 if (!XBZRLE.current_buf) { 2755 error_report("%s: Error allocating current_buf", __func__); 2756 goto free_encoded_buf; 2757 } 2758 2759 /* We are all good */ 2760 XBZRLE_cache_unlock(); 2761 return 0; 2762 2763 free_encoded_buf: 2764 g_free(XBZRLE.encoded_buf); 2765 XBZRLE.encoded_buf = NULL; 2766 free_cache: 2767 cache_fini(XBZRLE.cache); 2768 XBZRLE.cache = NULL; 2769 free_zero_page: 2770 g_free(XBZRLE.zero_target_page); 2771 XBZRLE.zero_target_page = NULL; 2772 err_out: 2773 XBZRLE_cache_unlock(); 2774 return -ENOMEM; 2775 } 2776 2777 static int ram_state_init(RAMState **rsp) 2778 { 2779 *rsp = g_try_new0(RAMState, 1); 2780 2781 if (!*rsp) { 2782 error_report("%s: Init ramstate fail", __func__); 2783 return -1; 2784 } 2785 2786 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2787 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2788 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2789 (*rsp)->ram_bytes_total = ram_bytes_total(); 2790 2791 /* 2792 * Count the total number of pages used by ram blocks not including any 2793 * gaps due to alignment or unplugs. 2794 * This must match with the initial values of dirty bitmap. 
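 *
 * (The per-block bitmaps themselves are filled with all ones later in
 * ram_list_init_bitmaps(), so migration_dirty_pages starts out as the
 * full page count computed here.)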
2795 */ 2796 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2797 ram_state_reset(*rsp); 2798 2799 return 0; 2800 } 2801 2802 static void ram_list_init_bitmaps(void) 2803 { 2804 MigrationState *ms = migrate_get_current(); 2805 RAMBlock *block; 2806 unsigned long pages; 2807 uint8_t shift; 2808 2809 /* Skip setting bitmap if there is no RAM */ 2810 if (ram_bytes_total()) { 2811 shift = ms->clear_bitmap_shift; 2812 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2813 error_report("clear_bitmap_shift (%u) too big, using " 2814 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2815 shift = CLEAR_BITMAP_SHIFT_MAX; 2816 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2817 error_report("clear_bitmap_shift (%u) too small, using " 2818 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2819 shift = CLEAR_BITMAP_SHIFT_MIN; 2820 } 2821 2822 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2823 pages = block->max_length >> TARGET_PAGE_BITS; 2824 /* 2825 * The initial dirty bitmap for migration must be set with all 2826 * ones to make sure we'll migrate every guest RAM page to 2827 * destination. 2828 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2829 * new migration after a failed migration, ram_list. 2830 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2831 * guest memory. 2832 */ 2833 block->bmap = bitmap_new(pages); 2834 bitmap_set(block->bmap, 0, pages); 2835 block->clear_bmap_shift = shift; 2836 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2837 } 2838 } 2839 } 2840 2841 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2842 { 2843 unsigned long pages; 2844 RAMBlock *rb; 2845 2846 RCU_READ_LOCK_GUARD(); 2847 2848 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2849 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2850 rs->migration_dirty_pages -= pages; 2851 } 2852 } 2853 2854 static void ram_init_bitmaps(RAMState *rs) 2855 { 2856 /* For memory_global_dirty_log_start below. */ 2857 qemu_mutex_lock_iothread(); 2858 qemu_mutex_lock_ramlist(); 2859 2860 WITH_RCU_READ_LOCK_GUARD() { 2861 ram_list_init_bitmaps(); 2862 /* We don't use dirty log with background snapshots */ 2863 if (!migrate_background_snapshot()) { 2864 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2865 migration_bitmap_sync_precopy(rs, false); 2866 } 2867 } 2868 qemu_mutex_unlock_ramlist(); 2869 qemu_mutex_unlock_iothread(); 2870 2871 /* 2872 * After an eventual first bitmap sync, fixup the initial bitmap 2873 * containing all 1s to exclude any discarded pages from migration. 2874 */ 2875 migration_bitmap_clear_discarded_pages(rs); 2876 } 2877 2878 static int ram_init_all(RAMState **rsp) 2879 { 2880 if (ram_state_init(rsp)) { 2881 return -1; 2882 } 2883 2884 if (xbzrle_init()) { 2885 ram_state_cleanup(rsp); 2886 return -1; 2887 } 2888 2889 ram_init_bitmaps(*rsp); 2890 2891 return 0; 2892 } 2893 2894 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2895 { 2896 RAMBlock *block; 2897 uint64_t pages = 0; 2898 2899 /* 2900 * Postcopy is not using xbzrle/compression, so no need for that. 2901 * Also, since source are already halted, we don't need to care 2902 * about dirty page logging as well. 2903 */ 2904 2905 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2906 pages += bitmap_count_one(block->bmap, 2907 block->used_length >> TARGET_PAGE_BITS); 2908 } 2909 2910 /* This may not be aligned with current bitmaps. Recalculate. 
*/ 2911 rs->migration_dirty_pages = pages; 2912 2913 ram_state_reset(rs); 2914 2915 /* Update RAMState cache of output QEMUFile */ 2916 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2917 2918 trace_ram_state_resume_prepare(pages); 2919 } 2920 2921 /* 2922 * This function clears bits of the free pages reported by the caller from the 2923 * migration dirty bitmap. @addr is the host address corresponding to the 2924 * start of the contiguous guest free pages, and @len is the total bytes of 2925 * those pages. 2926 */ 2927 void qemu_guest_free_page_hint(void *addr, size_t len) 2928 { 2929 RAMBlock *block; 2930 ram_addr_t offset; 2931 size_t used_len, start, npages; 2932 MigrationState *s = migrate_get_current(); 2933 2934 /* This function is currently expected to be used during live migration */ 2935 if (!migration_is_setup_or_active(s->state)) { 2936 return; 2937 } 2938 2939 for (; len > 0; len -= used_len, addr += used_len) { 2940 block = qemu_ram_block_from_host(addr, false, &offset); 2941 if (unlikely(!block || offset >= block->used_length)) { 2942 /* 2943 * The implementation might not support RAMBlock resize during 2944 * live migration, but it could happen in theory with future 2945 * updates. So we add a check here to capture that case. 2946 */ 2947 error_report_once("%s unexpected error", __func__); 2948 return; 2949 } 2950 2951 if (len <= block->used_length - offset) { 2952 used_len = len; 2953 } else { 2954 used_len = block->used_length - offset; 2955 } 2956 2957 start = offset >> TARGET_PAGE_BITS; 2958 npages = used_len >> TARGET_PAGE_BITS; 2959 2960 qemu_mutex_lock(&ram_state->bitmap_mutex); 2961 /* 2962 * The skipped free pages are equivalent to having been sent from clear_bmap's 2963 * perspective, so clear the bits from the memory region bitmap which 2964 * are initially set. Otherwise those skipped pages will be sent in 2965 * the next round after syncing from the memory region bitmap. 2966 */ 2967 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2968 ram_state->migration_dirty_pages -= 2969 bitmap_count_one_with_offset(block->bmap, start, npages); 2970 bitmap_clear(block->bmap, start, npages); 2971 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2972 } 2973 } 2974 2975 /* 2976 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2977 * a long-running RCU critical section. When rcu-reclaims in the code 2978 * start to become numerous it will be necessary to reduce the 2979 * granularity of these critical sections. 2980 */ 2981 2982 /** 2983 * ram_save_setup: Setup RAM for migration 2984 * 2985 * Returns zero to indicate success and negative for error 2986 * 2987 * @f: QEMUFile where to send the data 2988 * @opaque: RAMState pointer 2989 */ 2990 static int ram_save_setup(QEMUFile *f, void *opaque) 2991 { 2992 RAMState **rsp = opaque; 2993 RAMBlock *block; 2994 int ret; 2995 2996 if (compress_threads_save_setup()) { 2997 return -1; 2998 } 2999 3000 /* migration has already set up the bitmap; reuse it. 
*/ 3001 if (!migration_in_colo_state()) { 3002 if (ram_init_all(rsp) != 0) { 3003 compress_threads_save_cleanup(); 3004 return -1; 3005 } 3006 } 3007 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3008 3009 WITH_RCU_READ_LOCK_GUARD() { 3010 qemu_put_be64(f, ram_bytes_total_with_ignored() 3011 | RAM_SAVE_FLAG_MEM_SIZE); 3012 3013 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3014 qemu_put_byte(f, strlen(block->idstr)); 3015 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3016 qemu_put_be64(f, block->used_length); 3017 if (migrate_postcopy_ram() && block->page_size != 3018 qemu_host_page_size) { 3019 qemu_put_be64(f, block->page_size); 3020 } 3021 if (migrate_ignore_shared()) { 3022 qemu_put_be64(f, block->mr->addr); 3023 } 3024 } 3025 } 3026 3027 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3028 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3029 3030 migration_ops = g_malloc0(sizeof(MigrationOps)); 3031 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3032 ret = multifd_send_sync_main(f); 3033 if (ret < 0) { 3034 return ret; 3035 } 3036 3037 if (!migrate_multifd_flush_after_each_section()) { 3038 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3039 } 3040 3041 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3042 qemu_fflush(f); 3043 3044 return 0; 3045 } 3046 3047 /** 3048 * ram_save_iterate: iterative stage for migration 3049 * 3050 * Returns zero to indicate success and negative for error 3051 * 3052 * @f: QEMUFile where to send the data 3053 * @opaque: RAMState pointer 3054 */ 3055 static int ram_save_iterate(QEMUFile *f, void *opaque) 3056 { 3057 RAMState **temp = opaque; 3058 RAMState *rs = *temp; 3059 int ret = 0; 3060 int i; 3061 int64_t t0; 3062 int done = 0; 3063 3064 if (blk_mig_bulk_active()) { 3065 /* Avoid transferring ram during bulk phase of block migration as 3066 * the bulk phase will usually take a long time and transferring 3067 * ram updates during that time is pointless. */ 3068 goto out; 3069 } 3070 3071 /* 3072 * We'll take this lock a little bit long, but it's okay for two reasons. 3073 * Firstly, the only possible other thread to take it is who calls 3074 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3075 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3076 * guarantees that we'll at least released it in a regular basis. 3077 */ 3078 qemu_mutex_lock(&rs->bitmap_mutex); 3079 WITH_RCU_READ_LOCK_GUARD() { 3080 if (ram_list.version != rs->last_version) { 3081 ram_state_reset(rs); 3082 } 3083 3084 /* Read version before ram_list.blocks */ 3085 smp_rmb(); 3086 3087 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3088 3089 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3090 i = 0; 3091 while ((ret = migration_rate_exceeded(f)) == 0 || 3092 postcopy_has_request(rs)) { 3093 int pages; 3094 3095 if (qemu_file_get_error(f)) { 3096 break; 3097 } 3098 3099 pages = ram_find_and_save_block(rs); 3100 /* no more pages to sent */ 3101 if (pages == 0) { 3102 done = 1; 3103 break; 3104 } 3105 3106 if (pages < 0) { 3107 qemu_file_set_error(f, pages); 3108 break; 3109 } 3110 3111 rs->target_page_count += pages; 3112 3113 /* 3114 * During postcopy, it is necessary to make sure one whole host 3115 * page is sent in one chunk. 3116 */ 3117 if (migrate_postcopy_ram()) { 3118 ram_flush_compressed_data(rs); 3119 } 3120 3121 /* 3122 * we want to check in the 1st loop, just in case it was the 1st 3123 * time and we had to sync the dirty bitmap. 
3124 * qemu_clock_get_ns() is a bit expensive, so we only check each 3125 * some iterations 3126 */ 3127 if ((i & 63) == 0) { 3128 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3129 1000000; 3130 if (t1 > MAX_WAIT) { 3131 trace_ram_save_iterate_big_wait(t1, i); 3132 break; 3133 } 3134 } 3135 i++; 3136 } 3137 } 3138 qemu_mutex_unlock(&rs->bitmap_mutex); 3139 3140 /* 3141 * Must occur before EOS (or any QEMUFile operation) 3142 * because of RDMA protocol. 3143 */ 3144 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3145 3146 out: 3147 if (ret >= 0 3148 && migration_is_setup_or_active(migrate_get_current()->state)) { 3149 if (migrate_multifd_flush_after_each_section()) { 3150 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3151 if (ret < 0) { 3152 return ret; 3153 } 3154 } 3155 3156 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3157 qemu_fflush(f); 3158 ram_transferred_add(8); 3159 3160 ret = qemu_file_get_error(f); 3161 } 3162 if (ret < 0) { 3163 return ret; 3164 } 3165 3166 return done; 3167 } 3168 3169 /** 3170 * ram_save_complete: function called to send the remaining amount of ram 3171 * 3172 * Returns zero to indicate success or negative on error 3173 * 3174 * Called with iothread lock 3175 * 3176 * @f: QEMUFile where to send the data 3177 * @opaque: RAMState pointer 3178 */ 3179 static int ram_save_complete(QEMUFile *f, void *opaque) 3180 { 3181 RAMState **temp = opaque; 3182 RAMState *rs = *temp; 3183 int ret = 0; 3184 3185 rs->last_stage = !migration_in_colo_state(); 3186 3187 WITH_RCU_READ_LOCK_GUARD() { 3188 if (!migration_in_postcopy()) { 3189 migration_bitmap_sync_precopy(rs, true); 3190 } 3191 3192 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3193 3194 /* try transferring iterative blocks of memory */ 3195 3196 /* flush all remaining blocks regardless of rate limiting */ 3197 qemu_mutex_lock(&rs->bitmap_mutex); 3198 while (true) { 3199 int pages; 3200 3201 pages = ram_find_and_save_block(rs); 3202 /* no more blocks to sent */ 3203 if (pages == 0) { 3204 break; 3205 } 3206 if (pages < 0) { 3207 ret = pages; 3208 break; 3209 } 3210 } 3211 qemu_mutex_unlock(&rs->bitmap_mutex); 3212 3213 ram_flush_compressed_data(rs); 3214 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3215 } 3216 3217 if (ret < 0) { 3218 return ret; 3219 } 3220 3221 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3222 if (ret < 0) { 3223 return ret; 3224 } 3225 3226 if (!migrate_multifd_flush_after_each_section()) { 3227 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3228 } 3229 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3230 qemu_fflush(f); 3231 3232 return 0; 3233 } 3234 3235 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3236 uint64_t *can_postcopy) 3237 { 3238 RAMState **temp = opaque; 3239 RAMState *rs = *temp; 3240 3241 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3242 3243 if (migrate_postcopy_ram()) { 3244 /* We can do postcopy, and all the data is postcopiable */ 3245 *can_postcopy += remaining_size; 3246 } else { 3247 *must_precopy += remaining_size; 3248 } 3249 } 3250 3251 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3252 uint64_t *can_postcopy) 3253 { 3254 MigrationState *s = migrate_get_current(); 3255 RAMState **temp = opaque; 3256 RAMState *rs = *temp; 3257 3258 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3259 3260 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3261 qemu_mutex_lock_iothread(); 3262 
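        /*
         * Close to convergence: refresh the dirty page count with one
         * more bitmap sync, done under the iothread lock taken just
         * above, so the "exact" pending size includes pages dirtied
         * since the last sync.
         */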
WITH_RCU_READ_LOCK_GUARD() { 3263 migration_bitmap_sync_precopy(rs, false); 3264 } 3265 qemu_mutex_unlock_iothread(); 3266 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3267 } 3268 3269 if (migrate_postcopy_ram()) { 3270 /* We can do postcopy, and all the data is postcopiable */ 3271 *can_postcopy += remaining_size; 3272 } else { 3273 *must_precopy += remaining_size; 3274 } 3275 } 3276 3277 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3278 { 3279 unsigned int xh_len; 3280 int xh_flags; 3281 uint8_t *loaded_data; 3282 3283 /* extract RLE header */ 3284 xh_flags = qemu_get_byte(f); 3285 xh_len = qemu_get_be16(f); 3286 3287 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3288 error_report("Failed to load XBZRLE page - wrong compression!"); 3289 return -1; 3290 } 3291 3292 if (xh_len > TARGET_PAGE_SIZE) { 3293 error_report("Failed to load XBZRLE page - len overflow!"); 3294 return -1; 3295 } 3296 loaded_data = XBZRLE.decoded_buf; 3297 /* load data and decode */ 3298 /* it can change loaded_data to point to an internal buffer */ 3299 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3300 3301 /* decode RLE */ 3302 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3303 TARGET_PAGE_SIZE) == -1) { 3304 error_report("Failed to load XBZRLE page - decode error!"); 3305 return -1; 3306 } 3307 3308 return 0; 3309 } 3310 3311 /** 3312 * ram_block_from_stream: read a RAMBlock id from the migration stream 3313 * 3314 * Must be called from within a rcu critical section. 3315 * 3316 * Returns a pointer from within the RCU-protected ram_list. 3317 * 3318 * @mis: the migration incoming state pointer 3319 * @f: QEMUFile where to read the data from 3320 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3321 * @channel: the channel we're using 3322 */ 3323 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3324 QEMUFile *f, int flags, 3325 int channel) 3326 { 3327 RAMBlock *block = mis->last_recv_block[channel]; 3328 char id[256]; 3329 uint8_t len; 3330 3331 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3332 if (!block) { 3333 error_report("Ack, bad migration stream!"); 3334 return NULL; 3335 } 3336 return block; 3337 } 3338 3339 len = qemu_get_byte(f); 3340 qemu_get_buffer(f, (uint8_t *)id, len); 3341 id[len] = 0; 3342 3343 block = qemu_ram_block_by_name(id); 3344 if (!block) { 3345 error_report("Can't find block %s", id); 3346 return NULL; 3347 } 3348 3349 if (ramblock_is_ignored(block)) { 3350 error_report("block %s should not be migrated !", id); 3351 return NULL; 3352 } 3353 3354 mis->last_recv_block[channel] = block; 3355 3356 return block; 3357 } 3358 3359 static inline void *host_from_ram_block_offset(RAMBlock *block, 3360 ram_addr_t offset) 3361 { 3362 if (!offset_in_ramblock(block, offset)) { 3363 return NULL; 3364 } 3365 3366 return block->host + offset; 3367 } 3368 3369 static void *host_page_from_ram_block_offset(RAMBlock *block, 3370 ram_addr_t offset) 3371 { 3372 /* Note: Explicitly no check against offset_in_ramblock(). 
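 * Postcopy may legitimately place pages beyond used_length when the
 * RAMBlock was shrunk while in postcopy (see the comment in
 * ram_load_postcopy()), so the check is deliberately omitted here.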
*/ 3373 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3374 block->page_size); 3375 } 3376 3377 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3378 ram_addr_t offset) 3379 { 3380 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3381 } 3382 3383 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3384 { 3385 qemu_mutex_lock(&ram_state->bitmap_mutex); 3386 for (int i = 0; i < pages; i++) { 3387 ram_addr_t offset = normal[i]; 3388 ram_state->migration_dirty_pages += !test_and_set_bit( 3389 offset >> TARGET_PAGE_BITS, 3390 block->bmap); 3391 } 3392 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3393 } 3394 3395 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3396 ram_addr_t offset, bool record_bitmap) 3397 { 3398 if (!offset_in_ramblock(block, offset)) { 3399 return NULL; 3400 } 3401 if (!block->colo_cache) { 3402 error_report("%s: colo_cache is NULL in block :%s", 3403 __func__, block->idstr); 3404 return NULL; 3405 } 3406 3407 /* 3408 * During colo checkpoint, we need bitmap of these migrated pages. 3409 * It help us to decide which pages in ram cache should be flushed 3410 * into VM's RAM later. 3411 */ 3412 if (record_bitmap) { 3413 colo_record_bitmap(block, &offset, 1); 3414 } 3415 return block->colo_cache + offset; 3416 } 3417 3418 /** 3419 * ram_handle_compressed: handle the zero page case 3420 * 3421 * If a page (or a whole RDMA chunk) has been 3422 * determined to be zero, then zap it. 3423 * 3424 * @host: host address for the zero page 3425 * @ch: what the page is filled from. We only support zero 3426 * @size: size of the zero page 3427 */ 3428 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3429 { 3430 if (ch != 0 || !buffer_is_zero(host, size)) { 3431 memset(host, ch, size); 3432 } 3433 } 3434 3435 static void colo_init_ram_state(void) 3436 { 3437 ram_state_init(&ram_state); 3438 } 3439 3440 /* 3441 * colo cache: this is for secondary VM, we cache the whole 3442 * memory of the secondary VM, it is need to hold the global lock 3443 * to call this helper. 3444 */ 3445 int colo_init_ram_cache(void) 3446 { 3447 RAMBlock *block; 3448 3449 WITH_RCU_READ_LOCK_GUARD() { 3450 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3451 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3452 NULL, false, false); 3453 if (!block->colo_cache) { 3454 error_report("%s: Can't alloc memory for COLO cache of block %s," 3455 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3456 block->used_length); 3457 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3458 if (block->colo_cache) { 3459 qemu_anon_ram_free(block->colo_cache, block->used_length); 3460 block->colo_cache = NULL; 3461 } 3462 } 3463 return -errno; 3464 } 3465 if (!machine_dump_guest_core(current_machine)) { 3466 qemu_madvise(block->colo_cache, block->used_length, 3467 QEMU_MADV_DONTDUMP); 3468 } 3469 } 3470 } 3471 3472 /* 3473 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3474 * with to decide which page in cache should be flushed into SVM's RAM. Here 3475 * we use the same name 'ram_bitmap' as for migration. 
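 * (PVM/SVM stand for the COLO primary and secondary VM respectively.)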
3476 */ 3477 if (ram_bytes_total()) { 3478 RAMBlock *block; 3479 3480 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3481 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3482 block->bmap = bitmap_new(pages); 3483 } 3484 } 3485 3486 colo_init_ram_state(); 3487 return 0; 3488 } 3489 3490 /* TODO: duplicated with ram_init_bitmaps */ 3491 void colo_incoming_start_dirty_log(void) 3492 { 3493 RAMBlock *block = NULL; 3494 /* For memory_global_dirty_log_start below. */ 3495 qemu_mutex_lock_iothread(); 3496 qemu_mutex_lock_ramlist(); 3497 3498 memory_global_dirty_log_sync(false); 3499 WITH_RCU_READ_LOCK_GUARD() { 3500 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3501 ramblock_sync_dirty_bitmap(ram_state, block); 3502 /* Discard this dirty bitmap record */ 3503 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3504 } 3505 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3506 } 3507 ram_state->migration_dirty_pages = 0; 3508 qemu_mutex_unlock_ramlist(); 3509 qemu_mutex_unlock_iothread(); 3510 } 3511 3512 /* It is need to hold the global lock to call this helper */ 3513 void colo_release_ram_cache(void) 3514 { 3515 RAMBlock *block; 3516 3517 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3518 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3519 g_free(block->bmap); 3520 block->bmap = NULL; 3521 } 3522 3523 WITH_RCU_READ_LOCK_GUARD() { 3524 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3525 if (block->colo_cache) { 3526 qemu_anon_ram_free(block->colo_cache, block->used_length); 3527 block->colo_cache = NULL; 3528 } 3529 } 3530 } 3531 ram_state_cleanup(&ram_state); 3532 } 3533 3534 /** 3535 * ram_load_setup: Setup RAM for migration incoming side 3536 * 3537 * Returns zero to indicate success and negative for error 3538 * 3539 * @f: QEMUFile where to receive the data 3540 * @opaque: RAMState pointer 3541 */ 3542 static int ram_load_setup(QEMUFile *f, void *opaque) 3543 { 3544 xbzrle_load_setup(); 3545 ramblock_recv_map_init(); 3546 3547 return 0; 3548 } 3549 3550 static int ram_load_cleanup(void *opaque) 3551 { 3552 RAMBlock *rb; 3553 3554 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3555 qemu_ram_block_writeback(rb); 3556 } 3557 3558 xbzrle_load_cleanup(); 3559 3560 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3561 g_free(rb->receivedmap); 3562 rb->receivedmap = NULL; 3563 } 3564 3565 return 0; 3566 } 3567 3568 /** 3569 * ram_postcopy_incoming_init: allocate postcopy data structures 3570 * 3571 * Returns 0 for success and negative if there was one error 3572 * 3573 * @mis: current migration incoming state 3574 * 3575 * Allocate data structures etc needed by incoming migration with 3576 * postcopy-ram. postcopy-ram's similarly names 3577 * postcopy_ram_incoming_init does the work. 3578 */ 3579 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3580 { 3581 return postcopy_ram_incoming_init(mis); 3582 } 3583 3584 /** 3585 * ram_load_postcopy: load a page in postcopy case 3586 * 3587 * Returns 0 for success or -errno in case of error 3588 * 3589 * Called in postcopy mode by ram_load(). 3590 * rcu_read_lock is taken prior to this being called. 
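 *
 * Each record read below starts with a 64-bit value: the low bits
 * (addr & ~TARGET_PAGE_MASK) carry the RAM_SAVE_FLAG_* flags and the
 * upper bits the page-aligned offset within the RAMBlock. Incoming
 * target pages are accumulated in the per-channel PostcopyTmpPage and
 * the whole host page is placed atomically, via postcopy_place_page()
 * or postcopy_place_page_zero(), once its last target page arrives.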
3591 * 3592 * @f: QEMUFile where to send the data 3593 * @channel: the channel to use for loading 3594 */ 3595 int ram_load_postcopy(QEMUFile *f, int channel) 3596 { 3597 int flags = 0, ret = 0; 3598 bool place_needed = false; 3599 bool matches_target_page_size = false; 3600 MigrationIncomingState *mis = migration_incoming_get_current(); 3601 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3602 3603 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3604 ram_addr_t addr; 3605 void *page_buffer = NULL; 3606 void *place_source = NULL; 3607 RAMBlock *block = NULL; 3608 uint8_t ch; 3609 int len; 3610 3611 addr = qemu_get_be64(f); 3612 3613 /* 3614 * If qemu file error, we should stop here, and then "addr" 3615 * may be invalid 3616 */ 3617 ret = qemu_file_get_error(f); 3618 if (ret) { 3619 break; 3620 } 3621 3622 flags = addr & ~TARGET_PAGE_MASK; 3623 addr &= TARGET_PAGE_MASK; 3624 3625 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3626 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3627 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3628 block = ram_block_from_stream(mis, f, flags, channel); 3629 if (!block) { 3630 ret = -EINVAL; 3631 break; 3632 } 3633 3634 /* 3635 * Relying on used_length is racy and can result in false positives. 3636 * We might place pages beyond used_length in case RAM was shrunk 3637 * while in postcopy, which is fine - trying to place via 3638 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3639 */ 3640 if (!block->host || addr >= block->postcopy_length) { 3641 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3642 ret = -EINVAL; 3643 break; 3644 } 3645 tmp_page->target_pages++; 3646 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3647 /* 3648 * Postcopy requires that we place whole host pages atomically; 3649 * these may be huge pages for RAMBlocks that are backed by 3650 * hugetlbfs. 3651 * To make it atomic, the data is read into a temporary page 3652 * that's moved into place later. 3653 * The migration protocol uses, possibly smaller, target-pages 3654 * however the source ensures it always sends all the components 3655 * of a host page in one chunk. 3656 */ 3657 page_buffer = tmp_page->tmp_huge_page + 3658 host_page_offset_from_ram_block_offset(block, addr); 3659 /* If all TP are zero then we can optimise the place */ 3660 if (tmp_page->target_pages == 1) { 3661 tmp_page->host_addr = 3662 host_page_from_ram_block_offset(block, addr); 3663 } else if (tmp_page->host_addr != 3664 host_page_from_ram_block_offset(block, addr)) { 3665 /* not the 1st TP within the HP */ 3666 error_report("Non-same host page detected on channel %d: " 3667 "Target host page %p, received host page %p " 3668 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3669 channel, tmp_page->host_addr, 3670 host_page_from_ram_block_offset(block, addr), 3671 block->idstr, addr, tmp_page->target_pages); 3672 ret = -EINVAL; 3673 break; 3674 } 3675 3676 /* 3677 * If it's the last part of a host page then we place the host 3678 * page 3679 */ 3680 if (tmp_page->target_pages == 3681 (block->page_size / TARGET_PAGE_SIZE)) { 3682 place_needed = true; 3683 } 3684 place_source = tmp_page->tmp_huge_page; 3685 } 3686 3687 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3688 case RAM_SAVE_FLAG_ZERO: 3689 ch = qemu_get_byte(f); 3690 /* 3691 * Can skip to set page_buffer when 3692 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
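 * In that case the host page is this single target page and, if it
 * stays all-zero, it is placed with postcopy_place_page_zero() below,
 * so the temporary buffer never needs to be filled.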
3693 */ 3694 if (ch || !matches_target_page_size) { 3695 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3696 } 3697 if (ch) { 3698 tmp_page->all_zero = false; 3699 } 3700 break; 3701 3702 case RAM_SAVE_FLAG_PAGE: 3703 tmp_page->all_zero = false; 3704 if (!matches_target_page_size) { 3705 /* For huge pages, we always use temporary buffer */ 3706 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3707 } else { 3708 /* 3709 * For small pages that matches target page size, we 3710 * avoid the qemu_file copy. Instead we directly use 3711 * the buffer of QEMUFile to place the page. Note: we 3712 * cannot do any QEMUFile operation before using that 3713 * buffer to make sure the buffer is valid when 3714 * placing the page. 3715 */ 3716 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3717 TARGET_PAGE_SIZE); 3718 } 3719 break; 3720 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3721 tmp_page->all_zero = false; 3722 len = qemu_get_be32(f); 3723 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3724 error_report("Invalid compressed data length: %d", len); 3725 ret = -EINVAL; 3726 break; 3727 } 3728 decompress_data_with_multi_threads(f, page_buffer, len); 3729 break; 3730 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 3731 multifd_recv_sync_main(); 3732 break; 3733 case RAM_SAVE_FLAG_EOS: 3734 /* normal exit */ 3735 if (migrate_multifd_flush_after_each_section()) { 3736 multifd_recv_sync_main(); 3737 } 3738 break; 3739 default: 3740 error_report("Unknown combination of migration flags: 0x%x" 3741 " (postcopy mode)", flags); 3742 ret = -EINVAL; 3743 break; 3744 } 3745 3746 /* Got the whole host page, wait for decompress before placing. */ 3747 if (place_needed) { 3748 ret |= wait_for_decompress_done(); 3749 } 3750 3751 /* Detect for any possible file errors */ 3752 if (!ret && qemu_file_get_error(f)) { 3753 ret = qemu_file_get_error(f); 3754 } 3755 3756 if (!ret && place_needed) { 3757 if (tmp_page->all_zero) { 3758 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3759 } else { 3760 ret = postcopy_place_page(mis, tmp_page->host_addr, 3761 place_source, block); 3762 } 3763 place_needed = false; 3764 postcopy_temp_page_reset(tmp_page); 3765 } 3766 } 3767 3768 return ret; 3769 } 3770 3771 static bool postcopy_is_running(void) 3772 { 3773 PostcopyState ps = postcopy_state_get(); 3774 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3775 } 3776 3777 /* 3778 * Flush content of RAM cache into SVM's memory. 3779 * Only flush the pages that be dirtied by PVM or SVM or both. 
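 *
 * (Roughly: sync the dirty log, walk every RAMBlock's bitmap for runs
 * of dirty pages, clear those bits and memcpy the corresponding ranges
 * from block->colo_cache into block->host.)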
3780 */ 3781 void colo_flush_ram_cache(void) 3782 { 3783 RAMBlock *block = NULL; 3784 void *dst_host; 3785 void *src_host; 3786 unsigned long offset = 0; 3787 3788 memory_global_dirty_log_sync(false); 3789 qemu_mutex_lock(&ram_state->bitmap_mutex); 3790 WITH_RCU_READ_LOCK_GUARD() { 3791 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3792 ramblock_sync_dirty_bitmap(ram_state, block); 3793 } 3794 } 3795 3796 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3797 WITH_RCU_READ_LOCK_GUARD() { 3798 block = QLIST_FIRST_RCU(&ram_list.blocks); 3799 3800 while (block) { 3801 unsigned long num = 0; 3802 3803 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3804 if (!offset_in_ramblock(block, 3805 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3806 offset = 0; 3807 num = 0; 3808 block = QLIST_NEXT_RCU(block, next); 3809 } else { 3810 unsigned long i = 0; 3811 3812 for (i = 0; i < num; i++) { 3813 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3814 } 3815 dst_host = block->host 3816 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3817 src_host = block->colo_cache 3818 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3819 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3820 offset += num; 3821 } 3822 } 3823 } 3824 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3825 trace_colo_flush_ram_cache_end(); 3826 } 3827 3828 /** 3829 * ram_load_precopy: load pages in precopy case 3830 * 3831 * Returns 0 for success or -errno in case of error 3832 * 3833 * Called in precopy mode by ram_load(). 3834 * rcu_read_lock is taken prior to this being called. 3835 * 3836 * @f: QEMUFile where to send the data 3837 */ 3838 static int ram_load_precopy(QEMUFile *f) 3839 { 3840 MigrationIncomingState *mis = migration_incoming_get_current(); 3841 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3842 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3843 bool postcopy_advised = migration_incoming_postcopy_advised(); 3844 if (!migrate_compress()) { 3845 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3846 } 3847 3848 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3849 ram_addr_t addr, total_ram_bytes; 3850 void *host = NULL, *host_bak = NULL; 3851 uint8_t ch; 3852 3853 /* 3854 * Yield periodically to let main loop run, but an iteration of 3855 * the main loop is expensive, so do it each some iterations 3856 */ 3857 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3858 aio_co_schedule(qemu_get_current_aio_context(), 3859 qemu_coroutine_self()); 3860 qemu_coroutine_yield(); 3861 } 3862 i++; 3863 3864 addr = qemu_get_be64(f); 3865 flags = addr & ~TARGET_PAGE_MASK; 3866 addr &= TARGET_PAGE_MASK; 3867 3868 if (flags & invalid_flags) { 3869 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3870 error_report("Received an unexpected compressed page"); 3871 } 3872 3873 ret = -EINVAL; 3874 break; 3875 } 3876 3877 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3878 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3879 RAMBlock *block = ram_block_from_stream(mis, f, flags, 3880 RAM_CHANNEL_PRECOPY); 3881 3882 host = host_from_ram_block_offset(block, addr); 3883 /* 3884 * After going into COLO stage, we should not load the page 3885 * into SVM's memory directly, we put them into colo_cache firstly. 3886 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3887 * Previously, we copied all these memory in preparing stage of COLO 3888 * while we need to stop VM, which is a time-consuming process. 
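 * (The cached pages are copied back into the SVM's memory later by
 * colo_flush_ram_cache() above.)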
3889 * Here we optimize it by a trick, back-up every page while in 3890 * migration process while COLO is enabled, though it affects the 3891 * speed of the migration, but it obviously reduce the downtime of 3892 * back-up all SVM'S memory in COLO preparing stage. 3893 */ 3894 if (migration_incoming_colo_enabled()) { 3895 if (migration_incoming_in_colo_state()) { 3896 /* In COLO stage, put all pages into cache temporarily */ 3897 host = colo_cache_from_block_offset(block, addr, true); 3898 } else { 3899 /* 3900 * In migration stage but before COLO stage, 3901 * Put all pages into both cache and SVM's memory. 3902 */ 3903 host_bak = colo_cache_from_block_offset(block, addr, false); 3904 } 3905 } 3906 if (!host) { 3907 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3908 ret = -EINVAL; 3909 break; 3910 } 3911 if (!migration_incoming_in_colo_state()) { 3912 ramblock_recv_bitmap_set(block, host); 3913 } 3914 3915 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3916 } 3917 3918 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3919 case RAM_SAVE_FLAG_MEM_SIZE: 3920 /* Synchronize RAM block list */ 3921 total_ram_bytes = addr; 3922 while (!ret && total_ram_bytes) { 3923 RAMBlock *block; 3924 char id[256]; 3925 ram_addr_t length; 3926 3927 len = qemu_get_byte(f); 3928 qemu_get_buffer(f, (uint8_t *)id, len); 3929 id[len] = 0; 3930 length = qemu_get_be64(f); 3931 3932 block = qemu_ram_block_by_name(id); 3933 if (block && !qemu_ram_is_migratable(block)) { 3934 error_report("block %s should not be migrated !", id); 3935 ret = -EINVAL; 3936 } else if (block) { 3937 if (length != block->used_length) { 3938 Error *local_err = NULL; 3939 3940 ret = qemu_ram_resize(block, length, 3941 &local_err); 3942 if (local_err) { 3943 error_report_err(local_err); 3944 } 3945 } 3946 /* For postcopy we need to check hugepage sizes match */ 3947 if (postcopy_advised && migrate_postcopy_ram() && 3948 block->page_size != qemu_host_page_size) { 3949 uint64_t remote_page_size = qemu_get_be64(f); 3950 if (remote_page_size != block->page_size) { 3951 error_report("Mismatched RAM page size %s " 3952 "(local) %zd != %" PRId64, 3953 id, block->page_size, 3954 remote_page_size); 3955 ret = -EINVAL; 3956 } 3957 } 3958 if (migrate_ignore_shared()) { 3959 hwaddr addr = qemu_get_be64(f); 3960 if (ramblock_is_ignored(block) && 3961 block->mr->addr != addr) { 3962 error_report("Mismatched GPAs for block %s " 3963 "%" PRId64 "!= %" PRId64, 3964 id, (uint64_t)addr, 3965 (uint64_t)block->mr->addr); 3966 ret = -EINVAL; 3967 } 3968 } 3969 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3970 block->idstr); 3971 } else { 3972 error_report("Unknown ramblock \"%s\", cannot " 3973 "accept migration", id); 3974 ret = -EINVAL; 3975 } 3976 3977 total_ram_bytes -= length; 3978 } 3979 break; 3980 3981 case RAM_SAVE_FLAG_ZERO: 3982 ch = qemu_get_byte(f); 3983 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3984 break; 3985 3986 case RAM_SAVE_FLAG_PAGE: 3987 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3988 break; 3989 3990 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3991 len = qemu_get_be32(f); 3992 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3993 error_report("Invalid compressed data length: %d", len); 3994 ret = -EINVAL; 3995 break; 3996 } 3997 decompress_data_with_multi_threads(f, host, len); 3998 break; 3999 4000 case RAM_SAVE_FLAG_XBZRLE: 4001 if (load_xbzrle(f, addr, host) < 0) { 4002 error_report("Failed to decompress XBZRLE page at " 4003 RAM_ADDR_FMT, addr); 4004 ret = -EINVAL; 4005 break; 4006 } 4007 break; 4008 case 
RAM_SAVE_FLAG_MULTIFD_FLUSH: 4009 multifd_recv_sync_main(); 4010 break; 4011 case RAM_SAVE_FLAG_EOS: 4012 /* normal exit */ 4013 if (migrate_multifd_flush_after_each_section()) { 4014 multifd_recv_sync_main(); 4015 } 4016 break; 4017 case RAM_SAVE_FLAG_HOOK: 4018 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4019 break; 4020 default: 4021 error_report("Unknown combination of migration flags: 0x%x", flags); 4022 ret = -EINVAL; 4023 } 4024 if (!ret) { 4025 ret = qemu_file_get_error(f); 4026 } 4027 if (!ret && host_bak) { 4028 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4029 } 4030 } 4031 4032 ret |= wait_for_decompress_done(); 4033 return ret; 4034 } 4035 4036 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4037 { 4038 int ret = 0; 4039 static uint64_t seq_iter; 4040 /* 4041 * If system is running in postcopy mode, page inserts to host memory must 4042 * be atomic 4043 */ 4044 bool postcopy_running = postcopy_is_running(); 4045 4046 seq_iter++; 4047 4048 if (version_id != 4) { 4049 return -EINVAL; 4050 } 4051 4052 /* 4053 * This RCU critical section can be very long running. 4054 * When RCU reclaims in the code start to become numerous, 4055 * it will be necessary to reduce the granularity of this 4056 * critical section. 4057 */ 4058 WITH_RCU_READ_LOCK_GUARD() { 4059 if (postcopy_running) { 4060 /* 4061 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4062 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4063 * service fast page faults. 4064 */ 4065 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4066 } else { 4067 ret = ram_load_precopy(f); 4068 } 4069 } 4070 trace_ram_load_complete(ret, seq_iter); 4071 4072 return ret; 4073 } 4074 4075 static bool ram_has_postcopy(void *opaque) 4076 { 4077 RAMBlock *rb; 4078 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4079 if (ramblock_is_pmem(rb)) { 4080 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4081 "is not supported now!", rb->idstr, rb->host); 4082 return false; 4083 } 4084 } 4085 4086 return migrate_postcopy_ram(); 4087 } 4088 4089 /* Sync all the dirty bitmap with destination VM. */ 4090 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4091 { 4092 RAMBlock *block; 4093 QEMUFile *file = s->to_dst_file; 4094 int ramblock_count = 0; 4095 4096 trace_ram_dirty_bitmap_sync_start(); 4097 4098 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4099 qemu_savevm_send_recv_bitmap(file, block->idstr); 4100 trace_ram_dirty_bitmap_request(block->idstr); 4101 ramblock_count++; 4102 } 4103 4104 trace_ram_dirty_bitmap_sync_wait(); 4105 4106 /* Wait until all the ramblocks' dirty bitmap synced */ 4107 while (ramblock_count--) { 4108 qemu_sem_wait(&s->rp_state.rp_sem); 4109 } 4110 4111 trace_ram_dirty_bitmap_sync_complete(); 4112 4113 return 0; 4114 } 4115 4116 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4117 { 4118 qemu_sem_post(&s->rp_state.rp_sem); 4119 } 4120 4121 /* 4122 * Read the received bitmap, revert it as the initial dirty bitmap. 4123 * This is only used when the postcopy migration is paused but wants 4124 * to resume from a middle point. 
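 * The layout read here matches what ramblock_recv_bitmap_send() produces
 * on the destination side:
 *   be64  size                        bitmap size in bytes, ROUND_UP(.., 8)
 *   <size bytes of little-endian bitmap>
 *   be64  end mark                    must be RAMBLOCK_RECV_BITMAP_ENDING
 * Returns 0 on success or a negative value on error; called from the
 * return path thread, once per RAMBlock being resynced.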
4125 */ 4126 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4127 { 4128 int ret = -EINVAL; 4129 /* from_dst_file is always valid because we're within rp_thread */ 4130 QEMUFile *file = s->rp_state.from_dst_file; 4131 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4132 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4133 uint64_t size, end_mark; 4134 4135 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4136 4137 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4138 error_report("%s: incorrect state %s", __func__, 4139 MigrationStatus_str(s->state)); 4140 return -EINVAL; 4141 } 4142 4143 /* 4144 * Note: see comments in ramblock_recv_bitmap_send() on why we 4145 * need the endianness conversion, and the paddings. 4146 */ 4147 local_size = ROUND_UP(local_size, 8); 4148 4149 /* Add paddings */ 4150 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4151 4152 size = qemu_get_be64(file); 4153 4154 /* The size of the bitmap should match with our ramblock */ 4155 if (size != local_size) { 4156 error_report("%s: ramblock '%s' bitmap size mismatch " 4157 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4158 block->idstr, size, local_size); 4159 ret = -EINVAL; 4160 goto out; 4161 } 4162 4163 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4164 end_mark = qemu_get_be64(file); 4165 4166 ret = qemu_file_get_error(file); 4167 if (ret || size != local_size) { 4168 error_report("%s: read bitmap failed for ramblock '%s': %d" 4169 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4170 __func__, block->idstr, ret, local_size, size); 4171 ret = -EIO; 4172 goto out; 4173 } 4174 4175 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4176 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4177 __func__, block->idstr, end_mark); 4178 ret = -EINVAL; 4179 goto out; 4180 } 4181 4182 /* 4183 * Endianness conversion. We are during postcopy (though paused). 4184 * The dirty bitmap won't change. We can directly modify it. 4185 */ 4186 bitmap_from_le(block->bmap, le_bitmap, nbits); 4187 4188 /* 4189 * What we received is "received bitmap". Revert it as the initial 4190 * dirty bitmap for this ramblock. 4191 */ 4192 bitmap_complement(block->bmap, block->bmap, nbits); 4193 4194 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4195 ramblock_dirty_bitmap_clear_discarded_pages(block); 4196 4197 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4198 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4199 4200 /* 4201 * We succeeded to sync bitmap for current ramblock. If this is 4202 * the last one to sync, we need to notify the main send thread. 
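 * ram_dirty_bitmap_sync_all() waits on rp_state.rp_sem once per ramblock,
 * so posting the semaphore unconditionally here is what lets it make
 * progress and eventually complete.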
4203 */ 4204 ram_dirty_bitmap_reload_notify(s); 4205 4206 ret = 0; 4207 out: 4208 g_free(le_bitmap); 4209 return ret; 4210 } 4211 4212 static int ram_resume_prepare(MigrationState *s, void *opaque) 4213 { 4214 RAMState *rs = *(RAMState **)opaque; 4215 int ret; 4216 4217 ret = ram_dirty_bitmap_sync_all(s, rs); 4218 if (ret) { 4219 return ret; 4220 } 4221 4222 ram_state_resume_prepare(rs, s->to_dst_file); 4223 4224 return 0; 4225 } 4226 4227 void postcopy_preempt_shutdown_file(MigrationState *s) 4228 { 4229 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4230 qemu_fflush(s->postcopy_qemufile_src); 4231 } 4232 4233 static SaveVMHandlers savevm_ram_handlers = { 4234 .save_setup = ram_save_setup, 4235 .save_live_iterate = ram_save_iterate, 4236 .save_live_complete_postcopy = ram_save_complete, 4237 .save_live_complete_precopy = ram_save_complete, 4238 .has_postcopy = ram_has_postcopy, 4239 .state_pending_exact = ram_state_pending_exact, 4240 .state_pending_estimate = ram_state_pending_estimate, 4241 .load_state = ram_load, 4242 .save_cleanup = ram_save_cleanup, 4243 .load_setup = ram_load_setup, 4244 .load_cleanup = ram_load_cleanup, 4245 .resume_prepare = ram_resume_prepare, 4246 }; 4247 4248 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4249 size_t old_size, size_t new_size) 4250 { 4251 PostcopyState ps = postcopy_state_get(); 4252 ram_addr_t offset; 4253 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4254 Error *err = NULL; 4255 4256 if (ramblock_is_ignored(rb)) { 4257 return; 4258 } 4259 4260 if (!migration_is_idle()) { 4261 /* 4262 * Precopy code on the source cannot deal with the size of RAM blocks 4263 * changing at random points in time - especially after sending the 4264 * RAM block sizes in the migration stream, they must no longer change. 4265 * Abort and indicate a proper reason. 4266 */ 4267 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4268 migration_cancel(err); 4269 error_free(err); 4270 } 4271 4272 switch (ps) { 4273 case POSTCOPY_INCOMING_ADVISE: 4274 /* 4275 * Update what ram_postcopy_incoming_init()->init_range() does at the 4276 * time postcopy was advised. Syncing RAM blocks with the source will 4277 * result in RAM resizes. 4278 */ 4279 if (old_size < new_size) { 4280 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4281 error_report("RAM block '%s' discard of resized RAM failed", 4282 rb->idstr); 4283 } 4284 } 4285 rb->postcopy_length = new_size; 4286 break; 4287 case POSTCOPY_INCOMING_NONE: 4288 case POSTCOPY_INCOMING_RUNNING: 4289 case POSTCOPY_INCOMING_END: 4290 /* 4291 * Once our guest is running, postcopy does no longer care about 4292 * resizes. When growing, the new memory was not available on the 4293 * source, no handler needed. 4294 */ 4295 break; 4296 default: 4297 error_report("RAM block '%s' resized during postcopy state: %d", 4298 rb->idstr, ps); 4299 exit(-1); 4300 } 4301 } 4302 4303 static RAMBlockNotifier ram_mig_ram_notifier = { 4304 .ram_block_resized = ram_mig_ram_block_resized, 4305 }; 4306 4307 void ram_mig_init(void) 4308 { 4309 qemu_mutex_init(&XBZRLE.lock); 4310 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4311 ram_block_notifier_add(&ram_mig_ram_notifier); 4312 } 4313
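
/*
 * For reference, a minimal illustrative sketch of the record layout that
 * ram_load_precopy() and ram_load_postcopy() above expect for a normal
 * page.  It is not used by this file; the real sender side is
 * save_page_header() and the save path earlier in this file:
 *
 *     uint64_t header = offset | RAM_SAVE_FLAG_PAGE;
 *     qemu_put_be64(f, header);                   flags live in the low bits,
 *                                                 the page-aligned offset in
 *                                                 the bits above them
 *     qemu_put_byte(f, strlen(block->idstr));     block name, omitted when
 *     qemu_put_buffer(f, (uint8_t *)block->idstr,
 *                     strlen(block->idstr));      RAM_SAVE_FLAG_CONTINUE is set
 *     qemu_put_buffer(f, page, TARGET_PAGE_SIZE); raw page payload
 *
 * A section of the stream is terminated by a header word carrying
 * RAM_SAVE_FLAG_EOS.
 */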