/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram-compress.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"
#include "rdma.h"
#include "options.h"
#include "sysemu/dirtylimit.h"
#include "sysemu/kvm.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */

/*
 * mapped-ram migration supports O_DIRECT, so we need to make sure the
 * userspace buffer, the IO operation size and the file offset are
 * aligned according to the underlying device's block size. The first
 * two are already aligned to page size, but we need to add padding to
 * the file to align the offset.  We cannot read the block size
 * dynamically because the migration file can be moved between
 * different systems, so use 1M to cover most block sizes and to keep
 * the file offset aligned at page size as well.
 */
#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000

/*
 * When doing mapped-ram migration, this is the amount we read from
 * the pages region in the migration file at a time.
 */
#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool migrate_ram_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block)
                                    && qemu_ram_is_named_file(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset)
{
    set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.
     * This is required when source and destination VMs are not using
     * the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /*
     * This is only used when postcopy is in recovery phase, to communicate
     * between the migration thread and the return path thread on dirty
     * bitmap synchronizations. This field is unused in other stages of
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration.
 * Some workloads dirty memory way too fast and will not effectively
 * converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by the guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = migration_transferred_bytes();
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
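     * Note that chunk_start and chunk_end are aligned down/up to the
     * clear_bmap chunk size, so the loop below clears whole chunks
     * covering the requested range.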
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
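 * For example, a VM backed by 4k pages plus 2M hugepages would yield
 * 0x1000 | 0x200000 = 0x201000.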
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
        stat64_get(&mig_stats.zero_pages) +
        compress_ram_pages() + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }
    compress_update_rates(page_count);
}

/*
 * Enable dirty-limit to throttle down the guest
 */
static void migration_dirty_limit_guest(void)
{
    /*
     * dirty page rate quota for all vCPUs fetched from
     * migration parameter 'vcpu_dirty_limit'
     */
    static int64_t quota_dirtyrate;
    MigrationState *s = migrate_get_current();

    /*
     * If the dirty limit is already enabled and the migration parameter
     * vcpu-dirty-limit is untouched, there is nothing to do.
     */
    if (dirtylimit_in_service() &&
        quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
        return;
    }

    quota_dirtyrate = s->parameters.vcpu_dirty_limit;

    /*
     * Set a quota dirty rate for all vCPUs; note that the second
     * parameter is ignored when setting the limit for all vCPUs of the VM.
     */
    qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
    trace_migration_dirty_limit_guest(quota_dirtyrate);
}

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration.
     */
    if (blk_mig_bulk_active()) {
        return;
    }

    /*
     * The following detection logic can be refined later. For now:
     * Check to see if the ratio between dirtied bytes and the approx.
     * amount of bytes that just got transferred since the last time
     * we were in this routine reaches the threshold. If that happens
     * twice, start or increase throttling.
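     * For example, with a threshold of 50%: if 100MB were transferred in
     * the period while the guest dirtied 200MB, the threshold is 50MB and
     * the 200MB of dirtied bytes count as one such event.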
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the pages have been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss,
                              ram_addr_t offset, int *pages)
{
    int ret;

    ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
                                 TARGET_PAGE_SIZE);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        *pages = 1;
        return true;
    }
    *pages = ret;
    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
{
    if (!multifd_queue_page(block, offset)) {
        return -1;
    }

    return 1;
}

int compress_send_queued_data(CompressParam *param)
{
    PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
    MigrationState *ms = migrate_get_current();
    QEMUFile *file = ms->to_dst_file;
    int len = 0;

    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    if (param->result == RES_NONE) {
        return 0;
    }

    assert(block == pss->last_sent_block);

    if (param->result == RES_ZEROPAGE) {
        assert(qemu_file_buffer_empty(param->file));
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    } else if (param->result == RES_COMPRESS) {
        assert(!qemu_file_buffer_empty(param->file));
        len += save_page_header(pss, file, block,
                                offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
        len += qemu_put_qemu_file(file, param->file);
    } else {
        abort();
    }

    update_compress_thread_counts(param, len);

    return len;
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (migrate_multifd() &&
                (!migrate_multifd_flush_after_each_section() ||
                 migrate_mapped_ram())) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main();
                if (ret < 0) {
                    return ret;
                }

                if (!migrate_mapped_ram()) {
                    qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                    qemu_fflush(f);
                }
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            compress_flush_data();

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range.
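         * The final two arguments drop write protection rather than adding
         * it, and wake up any thread blocked on a write fault in the range.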
         */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @rb: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically.
     * We don't have to track modifications via userfaultfd WP reliably,
     * because these pages will not be part of the migration stream either
     * way -- see ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP,
                                 NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
1863 */ 1864 if (block) { 1865 unsigned long page; 1866 1867 page = offset >> TARGET_PAGE_BITS; 1868 dirty = test_bit(page, block->bmap); 1869 if (!dirty) { 1870 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1871 page); 1872 } else { 1873 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1874 } 1875 } 1876 1877 } while (block && !dirty); 1878 1879 if (!block) { 1880 /* 1881 * Poll write faults too if background snapshot is enabled; that's 1882 * when we have vcpus got blocked by the write protected pages. 1883 */ 1884 block = poll_fault_page(rs, &offset); 1885 } 1886 1887 if (block) { 1888 /* 1889 * We want the background search to continue from the queued page 1890 * since the guest is likely to want other pages near to the page 1891 * it just requested. 1892 */ 1893 pss->block = block; 1894 pss->page = offset >> TARGET_PAGE_BITS; 1895 1896 /* 1897 * This unqueued page would break the "one round" check, even is 1898 * really rare. 1899 */ 1900 pss->complete_round = false; 1901 } 1902 1903 return !!block; 1904 } 1905 1906 /** 1907 * migration_page_queue_free: drop any remaining pages in the ram 1908 * request queue 1909 * 1910 * It should be empty at the end anyway, but in error cases there may 1911 * be some left. in case that there is any page left, we drop it. 1912 * 1913 */ 1914 static void migration_page_queue_free(RAMState *rs) 1915 { 1916 struct RAMSrcPageRequest *mspr, *next_mspr; 1917 /* This queue generally should be empty - but in the case of a failed 1918 * migration might have some droppings in. 1919 */ 1920 RCU_READ_LOCK_GUARD(); 1921 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1922 memory_region_unref(mspr->rb->mr); 1923 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1924 g_free(mspr); 1925 } 1926 } 1927 1928 /** 1929 * ram_save_queue_pages: queue the page for transmission 1930 * 1931 * A request from postcopy destination for example. 1932 * 1933 * Returns zero on success or negative on error 1934 * 1935 * @rbname: Name of the RAMBLock of the request. NULL means the 1936 * same that last one. 1937 * @start: starting address from the start of the RAMBlock 1938 * @len: length (in bytes) to send 1939 */ 1940 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, 1941 Error **errp) 1942 { 1943 RAMBlock *ramblock; 1944 RAMState *rs = ram_state; 1945 1946 stat64_add(&mig_stats.postcopy_requests, 1); 1947 RCU_READ_LOCK_GUARD(); 1948 1949 if (!rbname) { 1950 /* Reuse last RAMBlock */ 1951 ramblock = rs->last_req_rb; 1952 1953 if (!ramblock) { 1954 /* 1955 * Shouldn't happen, we can't reuse the last RAMBlock if 1956 * it's the 1st request. 1957 */ 1958 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block"); 1959 return -1; 1960 } 1961 } else { 1962 ramblock = qemu_ram_block_by_name(rbname); 1963 1964 if (!ramblock) { 1965 /* We shouldn't be asked for a non-existent RAMBlock */ 1966 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname); 1967 return -1; 1968 } 1969 rs->last_req_rb = ramblock; 1970 } 1971 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1972 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1973 error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, " 1974 "start=" RAM_ADDR_FMT " len=" 1975 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1976 start, len, ramblock->used_length); 1977 return -1; 1978 } 1979 1980 /* 1981 * When with postcopy preempt, we send back the page directly in the 1982 * rp-return thread. 
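 *
 * A rough sketch of the preempt path that follows (illustrative only,
 * mirroring the code below rather than defining it):
 *
 *   pss_init(pss, ramblock, page_start);      // aim PSS at the request
 *   pss->pss_channel = postcopy_qemufile_src; // dedicated preempt channel
 *   while (len) {
 *       ram_save_host_page_urgent(pss);       // push one whole host page
 *       len -= page_size;
 *   }
 *
 * so an urgent request bypasses the background scan entirely and is
 * answered directly on the postcopy channel.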
1983 */ 1984 if (postcopy_preempt_active()) { 1985 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1986 size_t page_size = qemu_ram_pagesize(ramblock); 1987 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1988 int ret = 0; 1989 1990 qemu_mutex_lock(&rs->bitmap_mutex); 1991 1992 pss_init(pss, ramblock, page_start); 1993 /* 1994 * Always use the preempt channel, and make sure it's there. It's 1995 * safe to access without lock, because when rp-thread is running 1996 * we should be the only one who operates on the qemufile 1997 */ 1998 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1999 assert(pss->pss_channel); 2000 2001 /* 2002 * It must be either one or multiple of host page size. Just 2003 * assert; if something wrong we're mostly split brain anyway. 2004 */ 2005 assert(len % page_size == 0); 2006 while (len) { 2007 if (ram_save_host_page_urgent(pss)) { 2008 error_setg(errp, "ram_save_host_page_urgent() failed: " 2009 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2010 ramblock->idstr, start); 2011 ret = -1; 2012 break; 2013 } 2014 /* 2015 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2016 * will automatically be moved and point to the next host page 2017 * we're going to send, so no need to update here. 2018 * 2019 * Normally QEMU never sends >1 host page in requests, so 2020 * logically we don't even need that as the loop should only 2021 * run once, but just to be consistent. 2022 */ 2023 len -= page_size; 2024 }; 2025 qemu_mutex_unlock(&rs->bitmap_mutex); 2026 2027 return ret; 2028 } 2029 2030 struct RAMSrcPageRequest *new_entry = 2031 g_new0(struct RAMSrcPageRequest, 1); 2032 new_entry->rb = ramblock; 2033 new_entry->offset = start; 2034 new_entry->len = len; 2035 2036 memory_region_ref(ramblock->mr); 2037 qemu_mutex_lock(&rs->src_page_req_mutex); 2038 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2039 migration_make_urgent_request(); 2040 qemu_mutex_unlock(&rs->src_page_req_mutex); 2041 2042 return 0; 2043 } 2044 2045 /* 2046 * try to compress the page before posting it out, return true if the page 2047 * has been properly handled by compression, otherwise needs other 2048 * paths to handle it 2049 */ 2050 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2051 ram_addr_t offset) 2052 { 2053 if (!migrate_compress()) { 2054 return false; 2055 } 2056 2057 /* 2058 * When starting the process of a new block, the first page of 2059 * the block should be sent out before other pages in the same 2060 * block, and all the pages in last block should have been sent 2061 * out, keeping this order is important, because the 'cont' flag 2062 * is used to avoid resending the block name. 2063 * 2064 * We post the fist page as normal page as compression will take 2065 * much CPU resource. 
2066 */ 2067 if (pss->block != pss->last_sent_block) { 2068 compress_flush_data(); 2069 return false; 2070 } 2071 2072 return compress_page_with_multi_thread(pss->block, offset, 2073 compress_send_queued_data); 2074 } 2075 2076 /** 2077 * ram_save_target_page_legacy: save one target page 2078 * 2079 * Returns the number of pages written 2080 * 2081 * @rs: current RAM state 2082 * @pss: data about the page we want to send 2083 */ 2084 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2085 { 2086 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2087 int res; 2088 2089 if (control_save_page(pss, offset, &res)) { 2090 return res; 2091 } 2092 2093 if (save_compress_page(rs, pss, offset)) { 2094 return 1; 2095 } 2096 2097 if (save_zero_page(rs, pss, offset)) { 2098 return 1; 2099 } 2100 2101 return ram_save_page(rs, pss); 2102 } 2103 2104 /** 2105 * ram_save_target_page_multifd: send one target page to multifd workers 2106 * 2107 * Returns 1 if the page was queued, -1 otherwise. 2108 * 2109 * @rs: current RAM state 2110 * @pss: data about the page we want to send 2111 */ 2112 static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) 2113 { 2114 RAMBlock *block = pss->block; 2115 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2116 2117 /* 2118 * While using multifd live migration, we still need to handle zero 2119 * page checking on the migration main thread. 2120 */ 2121 if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { 2122 if (save_zero_page(rs, pss, offset)) { 2123 return 1; 2124 } 2125 } 2126 2127 return ram_save_multifd_page(block, offset); 2128 } 2129 2130 /* Should be called before sending a host page */ 2131 static void pss_host_page_prepare(PageSearchStatus *pss) 2132 { 2133 /* How many guest pages are there in one host page? */ 2134 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2135 2136 pss->host_page_sending = true; 2137 if (guest_pfns <= 1) { 2138 /* 2139 * This covers both when guest psize == host psize, or when guest 2140 * has larger psize than the host (guest_pfns==0). 2141 * 2142 * For the latter, we always send one whole guest page per 2143 * iteration of the host page (example: an Alpha VM on x86 host 2144 * will have guest psize 8K while host psize 4K). 2145 */ 2146 pss->host_page_start = pss->page; 2147 pss->host_page_end = pss->page + 1; 2148 } else { 2149 /* 2150 * The host page spans over multiple guest pages, we send them 2151 * within the same host page iteration. 2152 */ 2153 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2154 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2155 } 2156 } 2157 2158 /* 2159 * Whether the page pointed by PSS is within the host page being sent. 2160 * Must be called after a previous pss_host_page_prepare(). 2161 */ 2162 static bool pss_within_range(PageSearchStatus *pss) 2163 { 2164 ram_addr_t ram_addr; 2165 2166 assert(pss->host_page_sending); 2167 2168 /* Over host-page boundary? */ 2169 if (pss->page >= pss->host_page_end) { 2170 return false; 2171 } 2172 2173 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2174 2175 return offset_in_ramblock(pss->block, ram_addr); 2176 } 2177 2178 static void pss_host_page_finish(PageSearchStatus *pss) 2179 { 2180 pss->host_page_sending = false; 2181 /* This is not needed, but just to reset it */ 2182 pss->host_page_start = pss->host_page_end = 0; 2183 } 2184 2185 /* 2186 * Send an urgent host page specified by `pss'. 
Need to be called with 2187 * bitmap_mutex held. 2188 * 2189 * Returns 0 if save host page succeeded, false otherwise. 2190 */ 2191 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2192 { 2193 bool page_dirty, sent = false; 2194 RAMState *rs = ram_state; 2195 int ret = 0; 2196 2197 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2198 pss_host_page_prepare(pss); 2199 2200 /* 2201 * If precopy is sending the same page, let it be done in precopy, or 2202 * we could send the same page in two channels and none of them will 2203 * receive the whole page. 2204 */ 2205 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2206 trace_postcopy_preempt_hit(pss->block->idstr, 2207 pss->page << TARGET_PAGE_BITS); 2208 return 0; 2209 } 2210 2211 do { 2212 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2213 2214 if (page_dirty) { 2215 /* Be strict to return code; it must be 1, or what else? */ 2216 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2217 error_report_once("%s: ram_save_target_page failed", __func__); 2218 ret = -1; 2219 goto out; 2220 } 2221 sent = true; 2222 } 2223 pss_find_next_dirty(pss); 2224 } while (pss_within_range(pss)); 2225 out: 2226 pss_host_page_finish(pss); 2227 /* For urgent requests, flush immediately if sent */ 2228 if (sent) { 2229 qemu_fflush(pss->pss_channel); 2230 } 2231 return ret; 2232 } 2233 2234 /** 2235 * ram_save_host_page: save a whole host page 2236 * 2237 * Starting at *offset send pages up to the end of the current host 2238 * page. It's valid for the initial offset to point into the middle of 2239 * a host page in which case the remainder of the hostpage is sent. 2240 * Only dirty target pages are sent. Note that the host page size may 2241 * be a huge page for this block. 2242 * 2243 * The saving stops at the boundary of the used_length of the block 2244 * if the RAMBlock isn't a multiple of the host page size. 2245 * 2246 * The caller must be with ram_state.bitmap_mutex held to call this 2247 * function. Note that this function can temporarily release the lock, but 2248 * when the function is returned it'll make sure the lock is still held. 2249 * 2250 * Returns the number of pages written or negative on error 2251 * 2252 * @rs: current RAM state 2253 * @pss: data about the page we want to send 2254 */ 2255 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2256 { 2257 bool page_dirty, preempt_active = postcopy_preempt_active(); 2258 int tmppages, pages = 0; 2259 size_t pagesize_bits = 2260 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2261 unsigned long start_page = pss->page; 2262 int res; 2263 2264 if (migrate_ram_is_ignored(pss->block)) { 2265 error_report("block %s should not be migrated !", pss->block->idstr); 2266 return 0; 2267 } 2268 2269 /* Update host page boundary information */ 2270 pss_host_page_prepare(pss); 2271 2272 do { 2273 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2274 2275 /* Check the pages is dirty and if it is send it */ 2276 if (page_dirty) { 2277 /* 2278 * Properly yield the lock only in postcopy preempt mode 2279 * because both migration thread and rp-return thread can 2280 * operate on the bitmaps. 
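 *
 * In shorthand, the code below amounts to (a simplified sketch):
 *
 *   if (preempt_active) unlock(bitmap_mutex);
 *   tmppages = ram_save_target_page(rs, pss);
 *   if (preempt_active) lock(bitmap_mutex);
 *
 * so the rp-return thread can take bitmap_mutex for an urgent page
 * while a potentially large page is still being written out here.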
2281 */ 2282 if (preempt_active) { 2283 qemu_mutex_unlock(&rs->bitmap_mutex); 2284 } 2285 tmppages = migration_ops->ram_save_target_page(rs, pss); 2286 if (tmppages >= 0) { 2287 pages += tmppages; 2288 /* 2289 * Allow rate limiting to happen in the middle of huge pages if 2290 * something is sent in the current iteration. 2291 */ 2292 if (pagesize_bits > 1 && tmppages > 0) { 2293 migration_rate_limit(); 2294 } 2295 } 2296 if (preempt_active) { 2297 qemu_mutex_lock(&rs->bitmap_mutex); 2298 } 2299 } else { 2300 tmppages = 0; 2301 } 2302 2303 if (tmppages < 0) { 2304 pss_host_page_finish(pss); 2305 return tmppages; 2306 } 2307 2308 pss_find_next_dirty(pss); 2309 } while (pss_within_range(pss)); 2310 2311 pss_host_page_finish(pss); 2312 2313 res = ram_save_release_protection(rs, pss, start_page); 2314 return (res < 0 ? res : pages); 2315 } 2316 2317 /** 2318 * ram_find_and_save_block: finds a dirty page and sends it to f 2319 * 2320 * Called within an RCU critical section. 2321 * 2322 * Returns the number of pages written where zero means no dirty pages, 2323 * or negative on error 2324 * 2325 * @rs: current RAM state 2326 * 2327 * On systems where host-page-size > target-page-size it will send all the 2328 * pages in a host page that are dirty. 2329 */ 2330 static int ram_find_and_save_block(RAMState *rs) 2331 { 2332 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2333 int pages = 0; 2334 2335 /* No dirty page as there is zero RAM */ 2336 if (!rs->ram_bytes_total) { 2337 return pages; 2338 } 2339 2340 /* 2341 * Always keep last_seen_block/last_page valid during this procedure, 2342 * because find_dirty_block() relies on these values (e.g., we compare 2343 * last_seen_block with pss.block to see whether we searched all the 2344 * ramblocks) to detect the completion of migration. Having NULL value 2345 * of last_seen_block can conditionally cause below loop to run forever. 
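 *
 * That is why the block right after this comment seeds last_seen_block
 * with the first entry of ram_list and resets last_page to 0 whenever
 * last_seen_block is still NULL.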
2346 */ 2347 if (!rs->last_seen_block) { 2348 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2349 rs->last_page = 0; 2350 } 2351 2352 pss_init(pss, rs->last_seen_block, rs->last_page); 2353 2354 while (true){ 2355 if (!get_queued_page(rs, pss)) { 2356 /* priority queue empty, so just search for something dirty */ 2357 int res = find_dirty_block(rs, pss); 2358 if (res != PAGE_DIRTY_FOUND) { 2359 if (res == PAGE_ALL_CLEAN) { 2360 break; 2361 } else if (res == PAGE_TRY_AGAIN) { 2362 continue; 2363 } else if (res < 0) { 2364 pages = res; 2365 break; 2366 } 2367 } 2368 } 2369 pages = ram_save_host_page(rs, pss); 2370 if (pages) { 2371 break; 2372 } 2373 } 2374 2375 rs->last_seen_block = pss->block; 2376 rs->last_page = pss->page; 2377 2378 return pages; 2379 } 2380 2381 static uint64_t ram_bytes_total_with_ignored(void) 2382 { 2383 RAMBlock *block; 2384 uint64_t total = 0; 2385 2386 RCU_READ_LOCK_GUARD(); 2387 2388 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2389 total += block->used_length; 2390 } 2391 return total; 2392 } 2393 2394 uint64_t ram_bytes_total(void) 2395 { 2396 RAMBlock *block; 2397 uint64_t total = 0; 2398 2399 RCU_READ_LOCK_GUARD(); 2400 2401 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2402 total += block->used_length; 2403 } 2404 return total; 2405 } 2406 2407 static void xbzrle_load_setup(void) 2408 { 2409 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2410 } 2411 2412 static void xbzrle_load_cleanup(void) 2413 { 2414 g_free(XBZRLE.decoded_buf); 2415 XBZRLE.decoded_buf = NULL; 2416 } 2417 2418 static void ram_state_cleanup(RAMState **rsp) 2419 { 2420 if (*rsp) { 2421 migration_page_queue_free(*rsp); 2422 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2423 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2424 g_free(*rsp); 2425 *rsp = NULL; 2426 } 2427 } 2428 2429 static void xbzrle_cleanup(void) 2430 { 2431 XBZRLE_cache_lock(); 2432 if (XBZRLE.cache) { 2433 cache_fini(XBZRLE.cache); 2434 g_free(XBZRLE.encoded_buf); 2435 g_free(XBZRLE.current_buf); 2436 g_free(XBZRLE.zero_target_page); 2437 XBZRLE.cache = NULL; 2438 XBZRLE.encoded_buf = NULL; 2439 XBZRLE.current_buf = NULL; 2440 XBZRLE.zero_target_page = NULL; 2441 } 2442 XBZRLE_cache_unlock(); 2443 } 2444 2445 static void ram_bitmaps_destroy(void) 2446 { 2447 RAMBlock *block; 2448 2449 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2450 g_free(block->clear_bmap); 2451 block->clear_bmap = NULL; 2452 g_free(block->bmap); 2453 block->bmap = NULL; 2454 g_free(block->file_bmap); 2455 block->file_bmap = NULL; 2456 } 2457 } 2458 2459 static void ram_save_cleanup(void *opaque) 2460 { 2461 RAMState **rsp = opaque; 2462 2463 /* We don't use dirty log with background snapshots */ 2464 if (!migrate_background_snapshot()) { 2465 /* caller have hold BQL or is in a bh, so there is 2466 * no writing race against the migration bitmap 2467 */ 2468 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2469 /* 2470 * do not stop dirty log without starting it, since 2471 * memory_global_dirty_log_stop will assert that 2472 * memory_global_dirty_log_start/stop used in pairs 2473 */ 2474 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2475 } 2476 } 2477 2478 ram_bitmaps_destroy(); 2479 2480 xbzrle_cleanup(); 2481 compress_threads_save_cleanup(); 2482 ram_state_cleanup(rsp); 2483 g_free(migration_ops); 2484 migration_ops = NULL; 2485 } 2486 2487 static void ram_state_reset(RAMState *rs) 2488 { 2489 int i; 2490 2491 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2492 rs->pss[i].last_sent_block = NULL; 2493 } 2494 2495 rs->last_seen_block = NULL; 2496 rs->last_page 
= 0; 2497 rs->last_version = ram_list.version; 2498 rs->xbzrle_started = false; 2499 } 2500 2501 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2502 2503 /* **** functions for postcopy ***** */ 2504 2505 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2506 { 2507 struct RAMBlock *block; 2508 2509 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2510 unsigned long *bitmap = block->bmap; 2511 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2512 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2513 2514 while (run_start < range) { 2515 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2516 ram_discard_range(block->idstr, 2517 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2518 ((ram_addr_t)(run_end - run_start)) 2519 << TARGET_PAGE_BITS); 2520 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2521 } 2522 } 2523 } 2524 2525 /** 2526 * postcopy_send_discard_bm_ram: discard a RAMBlock 2527 * 2528 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2529 * 2530 * @ms: current migration state 2531 * @block: RAMBlock to discard 2532 */ 2533 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2534 { 2535 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2536 unsigned long current; 2537 unsigned long *bitmap = block->bmap; 2538 2539 for (current = 0; current < end; ) { 2540 unsigned long one = find_next_bit(bitmap, end, current); 2541 unsigned long zero, discard_length; 2542 2543 if (one >= end) { 2544 break; 2545 } 2546 2547 zero = find_next_zero_bit(bitmap, end, one + 1); 2548 2549 if (zero >= end) { 2550 discard_length = end - one; 2551 } else { 2552 discard_length = zero - one; 2553 } 2554 postcopy_discard_send_range(ms, one, discard_length); 2555 current = one + discard_length; 2556 } 2557 } 2558 2559 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2560 2561 /** 2562 * postcopy_each_ram_send_discard: discard all RAMBlocks 2563 * 2564 * Utility for the outgoing postcopy code. 2565 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2566 * passing it bitmap indexes and name. 2567 * (qemu_ram_foreach_block ends up passing unscaled lengths 2568 * which would mean postcopy code would have to deal with target page) 2569 * 2570 * @ms: current migration state 2571 */ 2572 static void postcopy_each_ram_send_discard(MigrationState *ms) 2573 { 2574 struct RAMBlock *block; 2575 2576 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2577 postcopy_discard_send_init(ms, block->idstr); 2578 2579 /* 2580 * Deal with TPS != HPS and huge pages. It discard any partially sent 2581 * host-page size chunks, mark any partially dirty host-page size 2582 * chunks as all dirty. In this case the host-page is the host-page 2583 * for the particular RAMBlock, i.e. it might be a huge page. 2584 */ 2585 postcopy_chunk_hostpages_pass(ms, block); 2586 2587 /* 2588 * Postcopy sends chunks of bitmap over the wire, but it 2589 * just needs indexes at this point, avoids it having 2590 * target page specific code. 2591 */ 2592 postcopy_send_discard_bm_ram(ms, block); 2593 postcopy_discard_send_finish(ms); 2594 } 2595 } 2596 2597 /** 2598 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2599 * 2600 * Helper for postcopy_chunk_hostpages; it's called twice to 2601 * canonicalize the two bitmaps, that are similar, but one is 2602 * inverted. 2603 * 2604 * Postcopy requires that all target pages in a hostpage are dirty or 2605 * clean, not a mix. This function canonicalizes the bitmaps. 
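 *
 * A worked example (assuming 2 MiB host pages and 4 KiB target pages,
 * so host_ratio == 512): if a dirty run starts at target-page index
 * 700, i.e. in the middle of the host page covering indexes 512..1023,
 * the fixup below re-marks all of 512..1023 as dirty, so that host
 * page is later discarded and resent as one unit instead of being
 * split.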
2606 * 2607 * @ms: current migration state 2608 * @block: block that contains the page we want to canonicalize 2609 */ 2610 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2611 { 2612 RAMState *rs = ram_state; 2613 unsigned long *bitmap = block->bmap; 2614 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2615 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2616 unsigned long run_start; 2617 2618 if (block->page_size == TARGET_PAGE_SIZE) { 2619 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2620 return; 2621 } 2622 2623 /* Find a dirty page */ 2624 run_start = find_next_bit(bitmap, pages, 0); 2625 2626 while (run_start < pages) { 2627 2628 /* 2629 * If the start of this run of pages is in the middle of a host 2630 * page, then we need to fixup this host page. 2631 */ 2632 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2633 /* Find the end of this run */ 2634 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2635 /* 2636 * If the end isn't at the start of a host page, then the 2637 * run doesn't finish at the end of a host page 2638 * and we need to discard. 2639 */ 2640 } 2641 2642 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2643 unsigned long page; 2644 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2645 host_ratio); 2646 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2647 2648 /* Clean up the bitmap */ 2649 for (page = fixup_start_addr; 2650 page < fixup_start_addr + host_ratio; page++) { 2651 /* 2652 * Remark them as dirty, updating the count for any pages 2653 * that weren't previously dirty. 2654 */ 2655 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2656 } 2657 } 2658 2659 /* Find the next dirty page for the next iteration */ 2660 run_start = find_next_bit(bitmap, pages, run_start); 2661 } 2662 } 2663 2664 /** 2665 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2666 * 2667 * Transmit the set of pages to be discarded after precopy to the target 2668 * these are pages that: 2669 * a) Have been previously transmitted but are now dirty again 2670 * b) Pages that have never been transmitted, this ensures that 2671 * any pages on the destination that have been mapped by background 2672 * tasks get discarded (transparent huge pages is the specific concern) 2673 * Hopefully this is pretty sparse 2674 * 2675 * @ms: current migration state 2676 */ 2677 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2678 { 2679 RAMState *rs = ram_state; 2680 2681 RCU_READ_LOCK_GUARD(); 2682 2683 /* This should be our last sync, the src is now paused */ 2684 migration_bitmap_sync(rs, false); 2685 2686 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2687 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2688 rs->last_seen_block = NULL; 2689 rs->last_page = 0; 2690 2691 postcopy_each_ram_send_discard(ms); 2692 2693 trace_ram_postcopy_send_discard_bitmap(); 2694 } 2695 2696 /** 2697 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2698 * 2699 * Returns zero on success 2700 * 2701 * @rbname: name of the RAMBlock of the request. NULL means the 2702 * same that last one. 
2703 * @start: RAMBlock starting page 2704 * @length: RAMBlock size 2705 */ 2706 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2707 { 2708 trace_ram_discard_range(rbname, start, length); 2709 2710 RCU_READ_LOCK_GUARD(); 2711 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2712 2713 if (!rb) { 2714 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2715 return -1; 2716 } 2717 2718 /* 2719 * On source VM, we don't need to update the received bitmap since 2720 * we don't even have one. 2721 */ 2722 if (rb->receivedmap) { 2723 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2724 length >> qemu_target_page_bits()); 2725 } 2726 2727 return ram_block_discard_range(rb, start, length); 2728 } 2729 2730 /* 2731 * For every allocation, we will try not to crash the VM if the 2732 * allocation failed. 2733 */ 2734 static bool xbzrle_init(Error **errp) 2735 { 2736 if (!migrate_xbzrle()) { 2737 return true; 2738 } 2739 2740 XBZRLE_cache_lock(); 2741 2742 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2743 if (!XBZRLE.zero_target_page) { 2744 error_setg(errp, "%s: Error allocating zero page", __func__); 2745 goto err_out; 2746 } 2747 2748 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2749 TARGET_PAGE_SIZE, errp); 2750 if (!XBZRLE.cache) { 2751 goto free_zero_page; 2752 } 2753 2754 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2755 if (!XBZRLE.encoded_buf) { 2756 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2757 goto free_cache; 2758 } 2759 2760 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2761 if (!XBZRLE.current_buf) { 2762 error_setg(errp, "%s: Error allocating current_buf", __func__); 2763 goto free_encoded_buf; 2764 } 2765 2766 /* We are all good */ 2767 XBZRLE_cache_unlock(); 2768 return true; 2769 2770 free_encoded_buf: 2771 g_free(XBZRLE.encoded_buf); 2772 XBZRLE.encoded_buf = NULL; 2773 free_cache: 2774 cache_fini(XBZRLE.cache); 2775 XBZRLE.cache = NULL; 2776 free_zero_page: 2777 g_free(XBZRLE.zero_target_page); 2778 XBZRLE.zero_target_page = NULL; 2779 err_out: 2780 XBZRLE_cache_unlock(); 2781 return false; 2782 } 2783 2784 static bool ram_state_init(RAMState **rsp, Error **errp) 2785 { 2786 *rsp = g_try_new0(RAMState, 1); 2787 2788 if (!*rsp) { 2789 error_setg(errp, "%s: Init ramstate fail", __func__); 2790 return false; 2791 } 2792 2793 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2794 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2795 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2796 (*rsp)->ram_bytes_total = ram_bytes_total(); 2797 2798 /* 2799 * Count the total number of pages used by ram blocks not including any 2800 * gaps due to alignment or unplugs. 2801 * This must match with the initial values of dirty bitmap. 
2802 */ 2803 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2804 ram_state_reset(*rsp); 2805 2806 return true; 2807 } 2808 2809 static void ram_list_init_bitmaps(void) 2810 { 2811 MigrationState *ms = migrate_get_current(); 2812 RAMBlock *block; 2813 unsigned long pages; 2814 uint8_t shift; 2815 2816 /* Skip setting bitmap if there is no RAM */ 2817 if (ram_bytes_total()) { 2818 shift = ms->clear_bitmap_shift; 2819 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2820 error_report("clear_bitmap_shift (%u) too big, using " 2821 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2822 shift = CLEAR_BITMAP_SHIFT_MAX; 2823 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2824 error_report("clear_bitmap_shift (%u) too small, using " 2825 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2826 shift = CLEAR_BITMAP_SHIFT_MIN; 2827 } 2828 2829 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2830 pages = block->max_length >> TARGET_PAGE_BITS; 2831 /* 2832 * The initial dirty bitmap for migration must be set with all 2833 * ones to make sure we'll migrate every guest RAM page to 2834 * destination. 2835 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2836 * new migration after a failed migration, ram_list. 2837 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2838 * guest memory. 2839 */ 2840 block->bmap = bitmap_new(pages); 2841 bitmap_set(block->bmap, 0, pages); 2842 if (migrate_mapped_ram()) { 2843 block->file_bmap = bitmap_new(pages); 2844 } 2845 block->clear_bmap_shift = shift; 2846 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2847 } 2848 } 2849 } 2850 2851 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2852 { 2853 unsigned long pages; 2854 RAMBlock *rb; 2855 2856 RCU_READ_LOCK_GUARD(); 2857 2858 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2859 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2860 rs->migration_dirty_pages -= pages; 2861 } 2862 } 2863 2864 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2865 { 2866 bool ret = true; 2867 2868 qemu_mutex_lock_ramlist(); 2869 2870 WITH_RCU_READ_LOCK_GUARD() { 2871 ram_list_init_bitmaps(); 2872 /* We don't use dirty log with background snapshots */ 2873 if (!migrate_background_snapshot()) { 2874 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2875 if (!ret) { 2876 goto out_unlock; 2877 } 2878 migration_bitmap_sync_precopy(rs, false); 2879 } 2880 } 2881 out_unlock: 2882 qemu_mutex_unlock_ramlist(); 2883 2884 if (!ret) { 2885 ram_bitmaps_destroy(); 2886 return false; 2887 } 2888 2889 /* 2890 * After an eventual first bitmap sync, fixup the initial bitmap 2891 * containing all 1s to exclude any discarded pages from migration. 2892 */ 2893 migration_bitmap_clear_discarded_pages(rs); 2894 return true; 2895 } 2896 2897 static int ram_init_all(RAMState **rsp, Error **errp) 2898 { 2899 if (!ram_state_init(rsp, errp)) { 2900 return -1; 2901 } 2902 2903 if (!xbzrle_init(errp)) { 2904 ram_state_cleanup(rsp); 2905 return -1; 2906 } 2907 2908 if (!ram_init_bitmaps(*rsp, errp)) { 2909 return -1; 2910 } 2911 2912 return 0; 2913 } 2914 2915 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2916 { 2917 RAMBlock *block; 2918 uint64_t pages = 0; 2919 2920 /* 2921 * Postcopy is not using xbzrle/compression, so no need for that. 2922 * Also, since source are already halted, we don't need to care 2923 * about dirty page logging as well. 
2924 */ 2925 2926 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2927 pages += bitmap_count_one(block->bmap, 2928 block->used_length >> TARGET_PAGE_BITS); 2929 } 2930 2931 /* This may not be aligned with current bitmaps. Recalculate. */ 2932 rs->migration_dirty_pages = pages; 2933 2934 ram_state_reset(rs); 2935 2936 /* Update RAMState cache of output QEMUFile */ 2937 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2938 2939 trace_ram_state_resume_prepare(pages); 2940 } 2941 2942 /* 2943 * This function clears bits of the free pages reported by the caller from the 2944 * migration dirty bitmap. @addr is the host address corresponding to the 2945 * start of the continuous guest free pages, and @len is the total bytes of 2946 * those pages. 2947 */ 2948 void qemu_guest_free_page_hint(void *addr, size_t len) 2949 { 2950 RAMBlock *block; 2951 ram_addr_t offset; 2952 size_t used_len, start, npages; 2953 2954 /* This function is currently expected to be used during live migration */ 2955 if (!migration_is_setup_or_active()) { 2956 return; 2957 } 2958 2959 for (; len > 0; len -= used_len, addr += used_len) { 2960 block = qemu_ram_block_from_host(addr, false, &offset); 2961 if (unlikely(!block || offset >= block->used_length)) { 2962 /* 2963 * The implementation might not support RAMBlock resize during 2964 * live migration, but it could happen in theory with future 2965 * updates. So we add a check here to capture that case. 2966 */ 2967 error_report_once("%s unexpected error", __func__); 2968 return; 2969 } 2970 2971 if (len <= block->used_length - offset) { 2972 used_len = len; 2973 } else { 2974 used_len = block->used_length - offset; 2975 } 2976 2977 start = offset >> TARGET_PAGE_BITS; 2978 npages = used_len >> TARGET_PAGE_BITS; 2979 2980 qemu_mutex_lock(&ram_state->bitmap_mutex); 2981 /* 2982 * The skipped free pages are equavalent to be sent from clear_bmap's 2983 * perspective, so clear the bits from the memory region bitmap which 2984 * are initially set. Otherwise those skipped pages will be sent in 2985 * the next round after syncing from the memory region bitmap. 2986 */ 2987 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2988 ram_state->migration_dirty_pages -= 2989 bitmap_count_one_with_offset(block->bmap, start, npages); 2990 bitmap_clear(block->bmap, start, npages); 2991 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2992 } 2993 } 2994 2995 #define MAPPED_RAM_HDR_VERSION 1 2996 struct MappedRamHeader { 2997 uint32_t version; 2998 /* 2999 * The target's page size, so we know how many pages are in the 3000 * bitmap. 3001 */ 3002 uint64_t page_size; 3003 /* 3004 * The offset in the migration file where the pages bitmap is 3005 * stored. 3006 */ 3007 uint64_t bitmap_offset; 3008 /* 3009 * The offset in the migration file where the actual pages (data) 3010 * are stored. 3011 */ 3012 uint64_t pages_offset; 3013 } QEMU_PACKED; 3014 typedef struct MappedRamHeader MappedRamHeader; 3015 3016 static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) 3017 { 3018 g_autofree MappedRamHeader *header = NULL; 3019 size_t header_size, bitmap_size; 3020 long num_pages; 3021 3022 header = g_new0(MappedRamHeader, 1); 3023 header_size = sizeof(MappedRamHeader); 3024 3025 num_pages = block->used_length >> TARGET_PAGE_BITS; 3026 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3027 3028 /* 3029 * Save the file offsets of where the bitmap and the pages should 3030 * go as they are written at the end of migration and during the 3031 * iterative phase, respectively. 
3032 */ 3033 block->bitmap_offset = qemu_get_offset(file) + header_size; 3034 block->pages_offset = ROUND_UP(block->bitmap_offset + 3035 bitmap_size, 3036 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 3037 3038 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 3039 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 3040 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 3041 header->pages_offset = cpu_to_be64(block->pages_offset); 3042 3043 qemu_put_buffer(file, (uint8_t *) header, header_size); 3044 3045 /* prepare offset for next ramblock */ 3046 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 3047 } 3048 3049 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 3050 Error **errp) 3051 { 3052 size_t ret, header_size = sizeof(MappedRamHeader); 3053 3054 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 3055 if (ret != header_size) { 3056 error_setg(errp, "Could not read whole mapped-ram migration header " 3057 "(expected %zd, got %zd bytes)", header_size, ret); 3058 return false; 3059 } 3060 3061 /* migration stream is big-endian */ 3062 header->version = be32_to_cpu(header->version); 3063 3064 if (header->version > MAPPED_RAM_HDR_VERSION) { 3065 error_setg(errp, "Migration mapped-ram capability version not " 3066 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 3067 header->version); 3068 return false; 3069 } 3070 3071 header->page_size = be64_to_cpu(header->page_size); 3072 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 3073 header->pages_offset = be64_to_cpu(header->pages_offset); 3074 3075 return true; 3076 } 3077 3078 /* 3079 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3080 * long-running RCU critical section. When rcu-reclaims in the code 3081 * start to become numerous it will be necessary to reduce the 3082 * granularity of these critical sections. 3083 */ 3084 3085 /** 3086 * ram_save_setup: Setup RAM for migration 3087 * 3088 * Returns zero to indicate success and negative for error 3089 * 3090 * @f: QEMUFile where to send the data 3091 * @opaque: RAMState pointer 3092 * @errp: pointer to Error*, to store an error if it happens. 3093 */ 3094 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 3095 { 3096 RAMState **rsp = opaque; 3097 RAMBlock *block; 3098 int ret, max_hg_page_size; 3099 3100 if (compress_threads_save_setup()) { 3101 error_setg(errp, "%s: failed to start compress threads", __func__); 3102 return -1; 3103 } 3104 3105 /* migration has already setup the bitmap, reuse it. */ 3106 if (!migration_in_colo_state()) { 3107 if (ram_init_all(rsp, errp) != 0) { 3108 compress_threads_save_cleanup(); 3109 return -1; 3110 } 3111 } 3112 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3113 3114 /* 3115 * ??? Mirrors the previous value of qemu_host_page_size, 3116 * but is this really what was intended for the migration? 
3117 */ 3118 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 3119 3120 WITH_RCU_READ_LOCK_GUARD() { 3121 qemu_put_be64(f, ram_bytes_total_with_ignored() 3122 | RAM_SAVE_FLAG_MEM_SIZE); 3123 3124 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3125 qemu_put_byte(f, strlen(block->idstr)); 3126 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3127 qemu_put_be64(f, block->used_length); 3128 if (migrate_postcopy_ram() && 3129 block->page_size != max_hg_page_size) { 3130 qemu_put_be64(f, block->page_size); 3131 } 3132 if (migrate_ignore_shared()) { 3133 qemu_put_be64(f, block->mr->addr); 3134 } 3135 3136 if (migrate_mapped_ram()) { 3137 mapped_ram_setup_ramblock(f, block); 3138 } 3139 } 3140 } 3141 3142 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3143 if (ret < 0) { 3144 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3145 qemu_file_set_error(f, ret); 3146 return ret; 3147 } 3148 3149 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3150 if (ret < 0) { 3151 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3152 qemu_file_set_error(f, ret); 3153 return ret; 3154 } 3155 3156 migration_ops = g_malloc0(sizeof(MigrationOps)); 3157 3158 if (migrate_multifd()) { 3159 migration_ops->ram_save_target_page = ram_save_target_page_multifd; 3160 } else { 3161 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3162 } 3163 3164 bql_unlock(); 3165 ret = multifd_send_sync_main(); 3166 bql_lock(); 3167 if (ret < 0) { 3168 error_setg(errp, "%s: multifd synchronization failed", __func__); 3169 return ret; 3170 } 3171 3172 if (migrate_multifd() && !migrate_multifd_flush_after_each_section() 3173 && !migrate_mapped_ram()) { 3174 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3175 } 3176 3177 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3178 ret = qemu_fflush(f); 3179 if (ret < 0) { 3180 error_setg_errno(errp, -ret, "%s failed", __func__); 3181 } 3182 return ret; 3183 } 3184 3185 static void ram_save_file_bmap(QEMUFile *f) 3186 { 3187 RAMBlock *block; 3188 3189 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3190 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3191 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3192 3193 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3194 block->bitmap_offset); 3195 ram_transferred_add(bitmap_size); 3196 3197 /* 3198 * Free the bitmap here to catch any synchronization issues 3199 * with multifd channels. No channels should be sending pages 3200 * after we've written the bitmap to file. 
3201 */ 3202 g_free(block->file_bmap); 3203 block->file_bmap = NULL; 3204 } 3205 } 3206 3207 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3208 { 3209 if (set) { 3210 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3211 } else { 3212 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3213 } 3214 } 3215 3216 /** 3217 * ram_save_iterate: iterative stage for migration 3218 * 3219 * Returns zero to indicate success and negative for error 3220 * 3221 * @f: QEMUFile where to send the data 3222 * @opaque: RAMState pointer 3223 */ 3224 static int ram_save_iterate(QEMUFile *f, void *opaque) 3225 { 3226 RAMState **temp = opaque; 3227 RAMState *rs = *temp; 3228 int ret = 0; 3229 int i; 3230 int64_t t0; 3231 int done = 0; 3232 3233 if (blk_mig_bulk_active()) { 3234 /* Avoid transferring ram during bulk phase of block migration as 3235 * the bulk phase will usually take a long time and transferring 3236 * ram updates during that time is pointless. */ 3237 goto out; 3238 } 3239 3240 /* 3241 * We'll take this lock a little bit long, but it's okay for two reasons. 3242 * Firstly, the only possible other thread to take it is who calls 3243 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3244 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3245 * guarantees that we'll at least released it in a regular basis. 3246 */ 3247 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3248 WITH_RCU_READ_LOCK_GUARD() { 3249 if (ram_list.version != rs->last_version) { 3250 ram_state_reset(rs); 3251 } 3252 3253 /* Read version before ram_list.blocks */ 3254 smp_rmb(); 3255 3256 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3257 if (ret < 0) { 3258 qemu_file_set_error(f, ret); 3259 goto out; 3260 } 3261 3262 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3263 i = 0; 3264 while ((ret = migration_rate_exceeded(f)) == 0 || 3265 postcopy_has_request(rs)) { 3266 int pages; 3267 3268 if (qemu_file_get_error(f)) { 3269 break; 3270 } 3271 3272 pages = ram_find_and_save_block(rs); 3273 /* no more pages to sent */ 3274 if (pages == 0) { 3275 done = 1; 3276 break; 3277 } 3278 3279 if (pages < 0) { 3280 qemu_file_set_error(f, pages); 3281 break; 3282 } 3283 3284 rs->target_page_count += pages; 3285 3286 /* 3287 * During postcopy, it is necessary to make sure one whole host 3288 * page is sent in one chunk. 3289 */ 3290 if (migrate_postcopy_ram()) { 3291 compress_flush_data(); 3292 } 3293 3294 /* 3295 * we want to check in the 1st loop, just in case it was the 1st 3296 * time and we had to sync the dirty bitmap. 3297 * qemu_clock_get_ns() is a bit expensive, so we only check each 3298 * some iterations 3299 */ 3300 if ((i & 63) == 0) { 3301 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3302 1000000; 3303 if (t1 > MAX_WAIT) { 3304 trace_ram_save_iterate_big_wait(t1, i); 3305 break; 3306 } 3307 } 3308 i++; 3309 } 3310 } 3311 } 3312 3313 /* 3314 * Must occur before EOS (or any QEMUFile operation) 3315 * because of RDMA protocol. 
3316 */ 3317 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3318 if (ret < 0) { 3319 qemu_file_set_error(f, ret); 3320 } 3321 3322 out: 3323 if (ret >= 0 3324 && migration_is_setup_or_active()) { 3325 if (migrate_multifd() && migrate_multifd_flush_after_each_section() && 3326 !migrate_mapped_ram()) { 3327 ret = multifd_send_sync_main(); 3328 if (ret < 0) { 3329 return ret; 3330 } 3331 } 3332 3333 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3334 ram_transferred_add(8); 3335 ret = qemu_fflush(f); 3336 } 3337 if (ret < 0) { 3338 return ret; 3339 } 3340 3341 return done; 3342 } 3343 3344 /** 3345 * ram_save_complete: function called to send the remaining amount of ram 3346 * 3347 * Returns zero to indicate success or negative on error 3348 * 3349 * Called with the BQL 3350 * 3351 * @f: QEMUFile where to send the data 3352 * @opaque: RAMState pointer 3353 */ 3354 static int ram_save_complete(QEMUFile *f, void *opaque) 3355 { 3356 RAMState **temp = opaque; 3357 RAMState *rs = *temp; 3358 int ret = 0; 3359 3360 rs->last_stage = !migration_in_colo_state(); 3361 3362 WITH_RCU_READ_LOCK_GUARD() { 3363 if (!migration_in_postcopy()) { 3364 migration_bitmap_sync_precopy(rs, true); 3365 } 3366 3367 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3368 if (ret < 0) { 3369 qemu_file_set_error(f, ret); 3370 return ret; 3371 } 3372 3373 /* try transferring iterative blocks of memory */ 3374 3375 /* flush all remaining blocks regardless of rate limiting */ 3376 qemu_mutex_lock(&rs->bitmap_mutex); 3377 while (true) { 3378 int pages; 3379 3380 pages = ram_find_and_save_block(rs); 3381 /* no more blocks to sent */ 3382 if (pages == 0) { 3383 break; 3384 } 3385 if (pages < 0) { 3386 qemu_mutex_unlock(&rs->bitmap_mutex); 3387 return pages; 3388 } 3389 } 3390 qemu_mutex_unlock(&rs->bitmap_mutex); 3391 3392 compress_flush_data(); 3393 3394 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3395 if (ret < 0) { 3396 qemu_file_set_error(f, ret); 3397 return ret; 3398 } 3399 } 3400 3401 ret = multifd_send_sync_main(); 3402 if (ret < 0) { 3403 return ret; 3404 } 3405 3406 if (migrate_mapped_ram()) { 3407 ram_save_file_bmap(f); 3408 3409 if (qemu_file_get_error(f)) { 3410 Error *local_err = NULL; 3411 int err = qemu_file_get_error_obj(f, &local_err); 3412 3413 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3414 return -err; 3415 } 3416 } 3417 3418 if (migrate_multifd() && !migrate_multifd_flush_after_each_section() && 3419 !migrate_mapped_ram()) { 3420 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3421 } 3422 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3423 return qemu_fflush(f); 3424 } 3425 3426 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3427 uint64_t *can_postcopy) 3428 { 3429 RAMState **temp = opaque; 3430 RAMState *rs = *temp; 3431 3432 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3433 3434 if (migrate_postcopy_ram()) { 3435 /* We can do postcopy, and all the data is postcopiable */ 3436 *can_postcopy += remaining_size; 3437 } else { 3438 *must_precopy += remaining_size; 3439 } 3440 } 3441 3442 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3443 uint64_t *can_postcopy) 3444 { 3445 RAMState **temp = opaque; 3446 RAMState *rs = *temp; 3447 uint64_t remaining_size; 3448 3449 if (!migration_in_postcopy()) { 3450 bql_lock(); 3451 WITH_RCU_READ_LOCK_GUARD() { 3452 migration_bitmap_sync_precopy(rs, false); 3453 } 3454 bql_unlock(); 3455 } 3456 3457 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3458 3459 
if (migrate_postcopy_ram()) { 3460 /* We can do postcopy, and all the data is postcopiable */ 3461 *can_postcopy += remaining_size; 3462 } else { 3463 *must_precopy += remaining_size; 3464 } 3465 } 3466 3467 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3468 { 3469 unsigned int xh_len; 3470 int xh_flags; 3471 uint8_t *loaded_data; 3472 3473 /* extract RLE header */ 3474 xh_flags = qemu_get_byte(f); 3475 xh_len = qemu_get_be16(f); 3476 3477 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3478 error_report("Failed to load XBZRLE page - wrong compression!"); 3479 return -1; 3480 } 3481 3482 if (xh_len > TARGET_PAGE_SIZE) { 3483 error_report("Failed to load XBZRLE page - len overflow!"); 3484 return -1; 3485 } 3486 loaded_data = XBZRLE.decoded_buf; 3487 /* load data and decode */ 3488 /* it can change loaded_data to point to an internal buffer */ 3489 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3490 3491 /* decode RLE */ 3492 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3493 TARGET_PAGE_SIZE) == -1) { 3494 error_report("Failed to load XBZRLE page - decode error!"); 3495 return -1; 3496 } 3497 3498 return 0; 3499 } 3500 3501 /** 3502 * ram_block_from_stream: read a RAMBlock id from the migration stream 3503 * 3504 * Must be called from within a rcu critical section. 3505 * 3506 * Returns a pointer from within the RCU-protected ram_list. 3507 * 3508 * @mis: the migration incoming state pointer 3509 * @f: QEMUFile where to read the data from 3510 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3511 * @channel: the channel we're using 3512 */ 3513 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3514 QEMUFile *f, int flags, 3515 int channel) 3516 { 3517 RAMBlock *block = mis->last_recv_block[channel]; 3518 char id[256]; 3519 uint8_t len; 3520 3521 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3522 if (!block) { 3523 error_report("Ack, bad migration stream!"); 3524 return NULL; 3525 } 3526 return block; 3527 } 3528 3529 len = qemu_get_byte(f); 3530 qemu_get_buffer(f, (uint8_t *)id, len); 3531 id[len] = 0; 3532 3533 block = qemu_ram_block_by_name(id); 3534 if (!block) { 3535 error_report("Can't find block %s", id); 3536 return NULL; 3537 } 3538 3539 if (migrate_ram_is_ignored(block)) { 3540 error_report("block %s should not be migrated !", id); 3541 return NULL; 3542 } 3543 3544 mis->last_recv_block[channel] = block; 3545 3546 return block; 3547 } 3548 3549 static inline void *host_from_ram_block_offset(RAMBlock *block, 3550 ram_addr_t offset) 3551 { 3552 if (!offset_in_ramblock(block, offset)) { 3553 return NULL; 3554 } 3555 3556 return block->host + offset; 3557 } 3558 3559 static void *host_page_from_ram_block_offset(RAMBlock *block, 3560 ram_addr_t offset) 3561 { 3562 /* Note: Explicitly no check against offset_in_ramblock(). 
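 * For example, with a 2 MiB host page size an address 0x3ff000 bytes
 * into the block rounds down to the host page that begins 0x200000
 * bytes in (assuming block->host itself is aligned to the host page
 * size, as hugetlbfs mappings are).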
*/ 3563 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3564 block->page_size); 3565 } 3566 3567 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3568 ram_addr_t offset) 3569 { 3570 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3571 } 3572 3573 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3574 { 3575 qemu_mutex_lock(&ram_state->bitmap_mutex); 3576 for (int i = 0; i < pages; i++) { 3577 ram_addr_t offset = normal[i]; 3578 ram_state->migration_dirty_pages += !test_and_set_bit( 3579 offset >> TARGET_PAGE_BITS, 3580 block->bmap); 3581 } 3582 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3583 } 3584 3585 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3586 ram_addr_t offset, bool record_bitmap) 3587 { 3588 if (!offset_in_ramblock(block, offset)) { 3589 return NULL; 3590 } 3591 if (!block->colo_cache) { 3592 error_report("%s: colo_cache is NULL in block :%s", 3593 __func__, block->idstr); 3594 return NULL; 3595 } 3596 3597 /* 3598 * During colo checkpoint, we need bitmap of these migrated pages. 3599 * It help us to decide which pages in ram cache should be flushed 3600 * into VM's RAM later. 3601 */ 3602 if (record_bitmap) { 3603 colo_record_bitmap(block, &offset, 1); 3604 } 3605 return block->colo_cache + offset; 3606 } 3607 3608 /** 3609 * ram_handle_zero: handle the zero page case 3610 * 3611 * If a page (or a whole RDMA chunk) has been 3612 * determined to be zero, then zap it. 3613 * 3614 * @host: host address for the zero page 3615 * @ch: what the page is filled from. We only support zero 3616 * @size: size of the zero page 3617 */ 3618 void ram_handle_zero(void *host, uint64_t size) 3619 { 3620 if (!buffer_is_zero(host, size)) { 3621 memset(host, 0, size); 3622 } 3623 } 3624 3625 static void colo_init_ram_state(void) 3626 { 3627 Error *local_err = NULL; 3628 3629 if (!ram_state_init(&ram_state, &local_err)) { 3630 error_report_err(local_err); 3631 } 3632 } 3633 3634 /* 3635 * colo cache: this is for secondary VM, we cache the whole 3636 * memory of the secondary VM, it is need to hold the global lock 3637 * to call this helper. 3638 */ 3639 int colo_init_ram_cache(void) 3640 { 3641 RAMBlock *block; 3642 3643 WITH_RCU_READ_LOCK_GUARD() { 3644 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3645 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3646 NULL, false, false); 3647 if (!block->colo_cache) { 3648 error_report("%s: Can't alloc memory for COLO cache of block %s," 3649 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3650 block->used_length); 3651 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3652 if (block->colo_cache) { 3653 qemu_anon_ram_free(block->colo_cache, block->used_length); 3654 block->colo_cache = NULL; 3655 } 3656 } 3657 return -errno; 3658 } 3659 if (!machine_dump_guest_core(current_machine)) { 3660 qemu_madvise(block->colo_cache, block->used_length, 3661 QEMU_MADV_DONTDUMP); 3662 } 3663 } 3664 } 3665 3666 /* 3667 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3668 * with to decide which page in cache should be flushed into SVM's RAM. Here 3669 * we use the same name 'ram_bitmap' as for migration. 
3670 */ 3671 if (ram_bytes_total()) { 3672 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3673 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3674 block->bmap = bitmap_new(pages); 3675 } 3676 } 3677 3678 colo_init_ram_state(); 3679 return 0; 3680 } 3681 3682 /* TODO: duplicated with ram_init_bitmaps */ 3683 void colo_incoming_start_dirty_log(void) 3684 { 3685 RAMBlock *block = NULL; 3686 Error *local_err = NULL; 3687 3688 /* For memory_global_dirty_log_start below. */ 3689 bql_lock(); 3690 qemu_mutex_lock_ramlist(); 3691 3692 memory_global_dirty_log_sync(false); 3693 WITH_RCU_READ_LOCK_GUARD() { 3694 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3695 ramblock_sync_dirty_bitmap(ram_state, block); 3696 /* Discard this dirty bitmap record */ 3697 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3698 } 3699 if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, 3700 &local_err)) { 3701 error_report_err(local_err); 3702 } 3703 } 3704 ram_state->migration_dirty_pages = 0; 3705 qemu_mutex_unlock_ramlist(); 3706 bql_unlock(); 3707 } 3708 3709 /* It is need to hold the global lock to call this helper */ 3710 void colo_release_ram_cache(void) 3711 { 3712 RAMBlock *block; 3713 3714 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3715 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3716 g_free(block->bmap); 3717 block->bmap = NULL; 3718 } 3719 3720 WITH_RCU_READ_LOCK_GUARD() { 3721 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3722 if (block->colo_cache) { 3723 qemu_anon_ram_free(block->colo_cache, block->used_length); 3724 block->colo_cache = NULL; 3725 } 3726 } 3727 } 3728 ram_state_cleanup(&ram_state); 3729 } 3730 3731 /** 3732 * ram_load_setup: Setup RAM for migration incoming side 3733 * 3734 * Returns zero to indicate success and negative for error 3735 * 3736 * @f: QEMUFile where to receive the data 3737 * @opaque: RAMState pointer 3738 * @errp: pointer to Error*, to store an error if it happens. 3739 */ 3740 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3741 { 3742 xbzrle_load_setup(); 3743 ramblock_recv_map_init(); 3744 3745 return 0; 3746 } 3747 3748 static int ram_load_cleanup(void *opaque) 3749 { 3750 RAMBlock *rb; 3751 3752 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3753 qemu_ram_block_writeback(rb); 3754 } 3755 3756 xbzrle_load_cleanup(); 3757 3758 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3759 g_free(rb->receivedmap); 3760 rb->receivedmap = NULL; 3761 } 3762 3763 return 0; 3764 } 3765 3766 /** 3767 * ram_postcopy_incoming_init: allocate postcopy data structures 3768 * 3769 * Returns 0 for success and negative if there was one error 3770 * 3771 * @mis: current migration incoming state 3772 * 3773 * Allocate data structures etc needed by incoming migration with 3774 * postcopy-ram. postcopy-ram's similarly names 3775 * postcopy_ram_incoming_init does the work. 3776 */ 3777 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3778 { 3779 return postcopy_ram_incoming_init(mis); 3780 } 3781 3782 /** 3783 * ram_load_postcopy: load a page in postcopy case 3784 * 3785 * Returns 0 for success or -errno in case of error 3786 * 3787 * Called in postcopy mode by ram_load(). 3788 * rcu_read_lock is taken prior to this being called. 
3789 * 3790 * @f: QEMUFile where to send the data 3791 * @channel: the channel to use for loading 3792 */ 3793 int ram_load_postcopy(QEMUFile *f, int channel) 3794 { 3795 int flags = 0, ret = 0; 3796 bool place_needed = false; 3797 bool matches_target_page_size = false; 3798 MigrationIncomingState *mis = migration_incoming_get_current(); 3799 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3800 3801 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3802 ram_addr_t addr; 3803 void *page_buffer = NULL; 3804 void *place_source = NULL; 3805 RAMBlock *block = NULL; 3806 uint8_t ch; 3807 int len; 3808 3809 addr = qemu_get_be64(f); 3810 3811 /* 3812 * If qemu file error, we should stop here, and then "addr" 3813 * may be invalid 3814 */ 3815 ret = qemu_file_get_error(f); 3816 if (ret) { 3817 break; 3818 } 3819 3820 flags = addr & ~TARGET_PAGE_MASK; 3821 addr &= TARGET_PAGE_MASK; 3822 3823 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3824 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3825 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3826 block = ram_block_from_stream(mis, f, flags, channel); 3827 if (!block) { 3828 ret = -EINVAL; 3829 break; 3830 } 3831 3832 /* 3833 * Relying on used_length is racy and can result in false positives. 3834 * We might place pages beyond used_length in case RAM was shrunk 3835 * while in postcopy, which is fine - trying to place via 3836 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3837 */ 3838 if (!block->host || addr >= block->postcopy_length) { 3839 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3840 ret = -EINVAL; 3841 break; 3842 } 3843 tmp_page->target_pages++; 3844 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3845 /* 3846 * Postcopy requires that we place whole host pages atomically; 3847 * these may be huge pages for RAMBlocks that are backed by 3848 * hugetlbfs. 3849 * To make it atomic, the data is read into a temporary page 3850 * that's moved into place later. 3851 * The migration protocol uses, possibly smaller, target-pages 3852 * however the source ensures it always sends all the components 3853 * of a host page in one chunk. 3854 */ 3855 page_buffer = tmp_page->tmp_huge_page + 3856 host_page_offset_from_ram_block_offset(block, addr); 3857 /* If all TP are zero then we can optimise the place */ 3858 if (tmp_page->target_pages == 1) { 3859 tmp_page->host_addr = 3860 host_page_from_ram_block_offset(block, addr); 3861 } else if (tmp_page->host_addr != 3862 host_page_from_ram_block_offset(block, addr)) { 3863 /* not the 1st TP within the HP */ 3864 error_report("Non-same host page detected on channel %d: " 3865 "Target host page %p, received host page %p " 3866 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3867 channel, tmp_page->host_addr, 3868 host_page_from_ram_block_offset(block, addr), 3869 block->idstr, addr, tmp_page->target_pages); 3870 ret = -EINVAL; 3871 break; 3872 } 3873 3874 /* 3875 * If it's the last part of a host page then we place the host 3876 * page 3877 */ 3878 if (tmp_page->target_pages == 3879 (block->page_size / TARGET_PAGE_SIZE)) { 3880 place_needed = true; 3881 } 3882 place_source = tmp_page->tmp_huge_page; 3883 } 3884 3885 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3886 case RAM_SAVE_FLAG_ZERO: 3887 ch = qemu_get_byte(f); 3888 if (ch != 0) { 3889 error_report("Found a zero page with value %d", ch); 3890 ret = -EINVAL; 3891 break; 3892 } 3893 /* 3894 * Can skip to set page_buffer when 3895 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3896 */ 3897 if (!matches_target_page_size) { 3898 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3899 } 3900 break; 3901 3902 case RAM_SAVE_FLAG_PAGE: 3903 tmp_page->all_zero = false; 3904 if (!matches_target_page_size) { 3905 /* For huge pages, we always use temporary buffer */ 3906 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3907 } else { 3908 /* 3909 * For small pages that matches target page size, we 3910 * avoid the qemu_file copy. Instead we directly use 3911 * the buffer of QEMUFile to place the page. Note: we 3912 * cannot do any QEMUFile operation before using that 3913 * buffer to make sure the buffer is valid when 3914 * placing the page. 3915 */ 3916 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3917 TARGET_PAGE_SIZE); 3918 } 3919 break; 3920 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3921 tmp_page->all_zero = false; 3922 len = qemu_get_be32(f); 3923 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3924 error_report("Invalid compressed data length: %d", len); 3925 ret = -EINVAL; 3926 break; 3927 } 3928 decompress_data_with_multi_threads(f, page_buffer, len); 3929 break; 3930 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 3931 multifd_recv_sync_main(); 3932 break; 3933 case RAM_SAVE_FLAG_EOS: 3934 /* normal exit */ 3935 if (migrate_multifd() && 3936 migrate_multifd_flush_after_each_section()) { 3937 multifd_recv_sync_main(); 3938 } 3939 break; 3940 default: 3941 error_report("Unknown combination of migration flags: 0x%x" 3942 " (postcopy mode)", flags); 3943 ret = -EINVAL; 3944 break; 3945 } 3946 3947 /* Got the whole host page, wait for decompress before placing. */ 3948 if (place_needed) { 3949 ret |= wait_for_decompress_done(); 3950 } 3951 3952 /* Detect for any possible file errors */ 3953 if (!ret && qemu_file_get_error(f)) { 3954 ret = qemu_file_get_error(f); 3955 } 3956 3957 if (!ret && place_needed) { 3958 if (tmp_page->all_zero) { 3959 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3960 } else { 3961 ret = postcopy_place_page(mis, tmp_page->host_addr, 3962 place_source, block); 3963 } 3964 place_needed = false; 3965 postcopy_temp_page_reset(tmp_page); 3966 } 3967 } 3968 3969 return ret; 3970 } 3971 3972 static bool postcopy_is_running(void) 3973 { 3974 PostcopyState ps = postcopy_state_get(); 3975 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3976 } 3977 3978 /* 3979 * Flush content of RAM cache into SVM's memory. 3980 * Only flush the pages that be dirtied by PVM or SVM or both. 
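 * In outline (matching the loop below):
 *   1. sync the dirty log and fold it into each block->bmap;
 *   2. colo_bitmap_find_dirty() returns a run of consecutive dirty pages;
 *   3. the run is cleared in the bitmap and copied from block->colo_cache
 *      into block->host with a single memcpy().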
3981 */ 3982 void colo_flush_ram_cache(void) 3983 { 3984 RAMBlock *block = NULL; 3985 void *dst_host; 3986 void *src_host; 3987 unsigned long offset = 0; 3988 3989 memory_global_dirty_log_sync(false); 3990 qemu_mutex_lock(&ram_state->bitmap_mutex); 3991 WITH_RCU_READ_LOCK_GUARD() { 3992 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3993 ramblock_sync_dirty_bitmap(ram_state, block); 3994 } 3995 } 3996 3997 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3998 WITH_RCU_READ_LOCK_GUARD() { 3999 block = QLIST_FIRST_RCU(&ram_list.blocks); 4000 4001 while (block) { 4002 unsigned long num = 0; 4003 4004 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4005 if (!offset_in_ramblock(block, 4006 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4007 offset = 0; 4008 num = 0; 4009 block = QLIST_NEXT_RCU(block, next); 4010 } else { 4011 unsigned long i = 0; 4012 4013 for (i = 0; i < num; i++) { 4014 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4015 } 4016 dst_host = block->host 4017 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4018 src_host = block->colo_cache 4019 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4020 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4021 offset += num; 4022 } 4023 } 4024 } 4025 qemu_mutex_unlock(&ram_state->bitmap_mutex); 4026 trace_colo_flush_ram_cache_end(); 4027 } 4028 4029 static size_t ram_load_multifd_pages(void *host_addr, size_t size, 4030 uint64_t offset) 4031 { 4032 MultiFDRecvData *data = multifd_get_recv_data(); 4033 4034 data->opaque = host_addr; 4035 data->file_offset = offset; 4036 data->size = size; 4037 4038 if (!multifd_recv()) { 4039 return 0; 4040 } 4041 4042 return size; 4043 } 4044 4045 static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 4046 long num_pages, unsigned long *bitmap, 4047 Error **errp) 4048 { 4049 ERRP_GUARD(); 4050 unsigned long set_bit_idx, clear_bit_idx; 4051 ram_addr_t offset; 4052 void *host; 4053 size_t read, unread, size; 4054 4055 for (set_bit_idx = find_first_bit(bitmap, num_pages); 4056 set_bit_idx < num_pages; 4057 set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { 4058 4059 clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); 4060 4061 unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); 4062 offset = set_bit_idx << TARGET_PAGE_BITS; 4063 4064 while (unread > 0) { 4065 host = host_from_ram_block_offset(block, offset); 4066 if (!host) { 4067 error_setg(errp, "page outside of ramblock %s range", 4068 block->idstr); 4069 return false; 4070 } 4071 4072 size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); 4073 4074 if (migrate_multifd()) { 4075 read = ram_load_multifd_pages(host, size, 4076 block->pages_offset + offset); 4077 } else { 4078 read = qemu_get_buffer_at(f, host, size, 4079 block->pages_offset + offset); 4080 } 4081 4082 if (!read) { 4083 goto err; 4084 } 4085 offset += read; 4086 unread -= read; 4087 } 4088 } 4089 4090 return true; 4091 4092 err: 4093 qemu_file_get_error_obj(f, errp); 4094 error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT 4095 "from file offset %" PRIx64 ": ", block->idstr, offset, 4096 block->pages_offset + offset); 4097 return false; 4098 } 4099 4100 static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 4101 ram_addr_t length, Error **errp) 4102 { 4103 g_autofree unsigned long *bitmap = NULL; 4104 MappedRamHeader header; 4105 size_t bitmap_size; 4106 long num_pages; 4107 4108 if (!mapped_ram_read_header(f, &header, errp)) { 4109 return; 4110 } 4111 4112 block->pages_offset = 
header.pages_offset; 4113 4114 /* 4115 * Check the alignment of the file region that contains pages. We 4116 * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that 4117 * value to change in the future. Do only a sanity check with page 4118 * size alignment. 4119 */ 4120 if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { 4121 error_setg(errp, 4122 "Error reading ramblock %s pages, region has bad alignment", 4123 block->idstr); 4124 return; 4125 } 4126 4127 num_pages = length / header.page_size; 4128 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 4129 4130 bitmap = g_malloc0(bitmap_size); 4131 if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, 4132 header.bitmap_offset) != bitmap_size) { 4133 error_setg(errp, "Error reading dirty bitmap"); 4134 return; 4135 } 4136 4137 if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { 4138 return; 4139 } 4140 4141 /* Skip pages array */ 4142 qemu_set_offset(f, block->pages_offset + length, SEEK_SET); 4143 4144 return; 4145 } 4146 4147 static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) 4148 { 4149 int ret = 0; 4150 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4151 bool postcopy_advised = migration_incoming_postcopy_advised(); 4152 int max_hg_page_size; 4153 Error *local_err = NULL; 4154 4155 assert(block); 4156 4157 if (migrate_mapped_ram()) { 4158 parse_ramblock_mapped_ram(f, block, length, &local_err); 4159 if (local_err) { 4160 error_report_err(local_err); 4161 return -EINVAL; 4162 } 4163 return 0; 4164 } 4165 4166 if (!qemu_ram_is_migratable(block)) { 4167 error_report("block %s should not be migrated !", block->idstr); 4168 return -EINVAL; 4169 } 4170 4171 if (length != block->used_length) { 4172 ret = qemu_ram_resize(block, length, &local_err); 4173 if (local_err) { 4174 error_report_err(local_err); 4175 return ret; 4176 } 4177 } 4178 4179 /* 4180 * ??? Mirrors the previous value of qemu_host_page_size, 4181 * but is this really what was intended for the migration? 
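 * (Worked example: a 64K-page host with 4K target pages gives
 * max_hg_page_size == 64K; a 4K-page host with 16K target pages gives 16K.)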
4182 */ 4183 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 4184 4185 /* For postcopy we need to check hugepage sizes match */ 4186 if (postcopy_advised && migrate_postcopy_ram() && 4187 block->page_size != max_hg_page_size) { 4188 uint64_t remote_page_size = qemu_get_be64(f); 4189 if (remote_page_size != block->page_size) { 4190 error_report("Mismatched RAM page size %s " 4191 "(local) %zd != %" PRId64, block->idstr, 4192 block->page_size, remote_page_size); 4193 return -EINVAL; 4194 } 4195 } 4196 if (migrate_ignore_shared()) { 4197 hwaddr addr = qemu_get_be64(f); 4198 if (migrate_ram_is_ignored(block) && 4199 block->mr->addr != addr) { 4200 error_report("Mismatched GPAs for block %s " 4201 "%" PRId64 "!= %" PRId64, block->idstr, 4202 (uint64_t)addr, (uint64_t)block->mr->addr); 4203 return -EINVAL; 4204 } 4205 } 4206 ret = rdma_block_notification_handle(f, block->idstr); 4207 if (ret < 0) { 4208 qemu_file_set_error(f, ret); 4209 } 4210 4211 return ret; 4212 } 4213 4214 static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes) 4215 { 4216 int ret = 0; 4217 4218 /* Synchronize RAM block list */ 4219 while (!ret && total_ram_bytes) { 4220 RAMBlock *block; 4221 char id[256]; 4222 ram_addr_t length; 4223 int len = qemu_get_byte(f); 4224 4225 qemu_get_buffer(f, (uint8_t *)id, len); 4226 id[len] = 0; 4227 length = qemu_get_be64(f); 4228 4229 block = qemu_ram_block_by_name(id); 4230 if (block) { 4231 ret = parse_ramblock(f, block, length); 4232 } else { 4233 error_report("Unknown ramblock \"%s\", cannot accept " 4234 "migration", id); 4235 ret = -EINVAL; 4236 } 4237 total_ram_bytes -= length; 4238 } 4239 4240 return ret; 4241 } 4242 4243 /** 4244 * ram_load_precopy: load pages in precopy case 4245 * 4246 * Returns 0 for success or -errno in case of error 4247 * 4248 * Called in precopy mode by ram_load(). 4249 * rcu_read_lock is taken prior to this being called. 
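 * The stream normally starts with a RAM_SAVE_FLAG_MEM_SIZE record whose addr
 * field carries the total RAM size, followed per ramblock by a one-byte id
 * length, the id string and a be64 length (plus the optional page-size/GPA
 * fields handled in parse_ramblock()). Page records then follow until
 * RAM_SAVE_FLAG_EOS closes the section. (Sketch derived from parse_ramblocks()
 * above; not a normative format description.)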
4250 *
4251 * @f: QEMUFile where to receive the data
4252 */
4253 static int ram_load_precopy(QEMUFile *f)
4254 {
4255 MigrationIncomingState *mis = migration_incoming_get_current();
4256 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4257
4258 if (!migrate_compress()) {
4259 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4260 }
4261
4262 if (migrate_mapped_ram()) {
4263 invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
4264 RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
4265 RAM_SAVE_FLAG_ZERO);
4266 }
4267
4268 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4269 ram_addr_t addr;
4270 void *host = NULL, *host_bak = NULL;
4271 uint8_t ch;
4272
4273 /*
4274 * Yield periodically to let the main loop run, but an iteration of
4275 * the main loop is expensive, so only do it once in a while
4276 */
4277 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4278 aio_co_schedule(qemu_get_current_aio_context(),
4279 qemu_coroutine_self());
4280 qemu_coroutine_yield();
4281 }
4282 i++;
4283
4284 addr = qemu_get_be64(f);
4285 ret = qemu_file_get_error(f);
4286 if (ret) {
4287 error_report("Getting RAM address failed");
4288 break;
4289 }
4290
4291 flags = addr & ~TARGET_PAGE_MASK;
4292 addr &= TARGET_PAGE_MASK;
4293
4294 if (flags & invalid_flags) {
4295 error_report("Unexpected RAM flags: %d", flags & invalid_flags);
4296
4297 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4298 error_report("Received an unexpected compressed page");
4299 }
4300
4301 ret = -EINVAL;
4302 break;
4303 }
4304
4305 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4306 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4307 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4308 RAM_CHANNEL_PRECOPY);
4309
4310 host = host_from_ram_block_offset(block, addr);
4311 /*
4312 * After entering the COLO stage we must not load pages into the
4313 * SVM's memory directly; we put them into colo_cache first.
4314 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4315 * Previously we copied all of this memory in the COLO preparation
4316 * stage, which required stopping the VM and was time-consuming.
4317 * Instead, back up every page while the migration is in progress
4318 * and COLO is enabled. This slows the migration down slightly, but
4319 * it clearly reduces the downtime compared to backing up all of
4320 * the SVM's memory in the COLO preparation stage.
4321 */
4322 if (migration_incoming_colo_enabled()) {
4323 if (migration_incoming_in_colo_state()) {
4324 /* In COLO stage, put all pages into cache temporarily */
4325 host = colo_cache_from_block_offset(block, addr, true);
4326 } else {
4327 /*
4328 * While in the migration stage but before the COLO stage,
4329 * put all pages into both the cache and the SVM's memory.
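 * (host_bak then points at the colo_cache copy; once the page data has
 * been loaded into host below, it is memcpy'd into host_bak as well.)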
4330 */ 4331 host_bak = colo_cache_from_block_offset(block, addr, false); 4332 } 4333 } 4334 if (!host) { 4335 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4336 ret = -EINVAL; 4337 break; 4338 } 4339 if (!migration_incoming_in_colo_state()) { 4340 ramblock_recv_bitmap_set(block, host); 4341 } 4342 4343 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4344 } 4345 4346 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4347 case RAM_SAVE_FLAG_MEM_SIZE: 4348 ret = parse_ramblocks(f, addr); 4349 /* 4350 * For mapped-ram migration (to a file) using multifd, we sync 4351 * once and for all here to make sure all tasks we queued to 4352 * multifd threads are completed, so that all the ramblocks 4353 * (including all the guest memory pages within) are fully 4354 * loaded after this sync returns. 4355 */ 4356 if (migrate_mapped_ram()) { 4357 multifd_recv_sync_main(); 4358 } 4359 break; 4360 4361 case RAM_SAVE_FLAG_ZERO: 4362 ch = qemu_get_byte(f); 4363 if (ch != 0) { 4364 error_report("Found a zero page with value %d", ch); 4365 ret = -EINVAL; 4366 break; 4367 } 4368 ram_handle_zero(host, TARGET_PAGE_SIZE); 4369 break; 4370 4371 case RAM_SAVE_FLAG_PAGE: 4372 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4373 break; 4374 4375 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4376 len = qemu_get_be32(f); 4377 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4378 error_report("Invalid compressed data length: %d", len); 4379 ret = -EINVAL; 4380 break; 4381 } 4382 decompress_data_with_multi_threads(f, host, len); 4383 break; 4384 4385 case RAM_SAVE_FLAG_XBZRLE: 4386 if (load_xbzrle(f, addr, host) < 0) { 4387 error_report("Failed to decompress XBZRLE page at " 4388 RAM_ADDR_FMT, addr); 4389 ret = -EINVAL; 4390 break; 4391 } 4392 break; 4393 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 4394 multifd_recv_sync_main(); 4395 break; 4396 case RAM_SAVE_FLAG_EOS: 4397 /* normal exit */ 4398 if (migrate_multifd() && 4399 migrate_multifd_flush_after_each_section() && 4400 /* 4401 * Mapped-ram migration flushes once and for all after 4402 * parsing ramblocks. Always ignore EOS for it. 4403 */ 4404 !migrate_mapped_ram()) { 4405 multifd_recv_sync_main(); 4406 } 4407 break; 4408 case RAM_SAVE_FLAG_HOOK: 4409 ret = rdma_registration_handle(f); 4410 if (ret < 0) { 4411 qemu_file_set_error(f, ret); 4412 } 4413 break; 4414 default: 4415 error_report("Unknown combination of migration flags: 0x%x", flags); 4416 ret = -EINVAL; 4417 } 4418 if (!ret) { 4419 ret = qemu_file_get_error(f); 4420 } 4421 if (!ret && host_bak) { 4422 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4423 } 4424 } 4425 4426 ret |= wait_for_decompress_done(); 4427 return ret; 4428 } 4429 4430 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4431 { 4432 int ret = 0; 4433 static uint64_t seq_iter; 4434 /* 4435 * If system is running in postcopy mode, page inserts to host memory must 4436 * be atomic 4437 */ 4438 bool postcopy_running = postcopy_is_running(); 4439 4440 seq_iter++; 4441 4442 if (version_id != 4) { 4443 return -EINVAL; 4444 } 4445 4446 /* 4447 * This RCU critical section can be very long running. 4448 * When RCU reclaims in the code start to become numerous, 4449 * it will be necessary to reduce the granularity of this 4450 * critical section. 4451 */ 4452 WITH_RCU_READ_LOCK_GUARD() { 4453 if (postcopy_running) { 4454 /* 4455 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4456 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4457 * service fast page faults. 
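 * That channel is drained by dedicated postcopy threads rather than by
 * this function, so here we only ever read from the precopy channel.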
4458 */ 4459 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4460 } else { 4461 ret = ram_load_precopy(f); 4462 } 4463 } 4464 trace_ram_load_complete(ret, seq_iter); 4465 4466 return ret; 4467 } 4468 4469 static bool ram_has_postcopy(void *opaque) 4470 { 4471 RAMBlock *rb; 4472 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4473 if (ramblock_is_pmem(rb)) { 4474 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4475 "is not supported now!", rb->idstr, rb->host); 4476 return false; 4477 } 4478 } 4479 4480 return migrate_postcopy_ram(); 4481 } 4482 4483 /* Sync all the dirty bitmap with destination VM. */ 4484 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4485 { 4486 RAMBlock *block; 4487 QEMUFile *file = s->to_dst_file; 4488 4489 trace_ram_dirty_bitmap_sync_start(); 4490 4491 qatomic_set(&rs->postcopy_bmap_sync_requested, 0); 4492 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4493 qemu_savevm_send_recv_bitmap(file, block->idstr); 4494 trace_ram_dirty_bitmap_request(block->idstr); 4495 qatomic_inc(&rs->postcopy_bmap_sync_requested); 4496 } 4497 4498 trace_ram_dirty_bitmap_sync_wait(); 4499 4500 /* Wait until all the ramblocks' dirty bitmap synced */ 4501 while (qatomic_read(&rs->postcopy_bmap_sync_requested)) { 4502 if (migration_rp_wait(s)) { 4503 return -1; 4504 } 4505 } 4506 4507 trace_ram_dirty_bitmap_sync_complete(); 4508 4509 return 0; 4510 } 4511 4512 /* 4513 * Read the received bitmap, revert it as the initial dirty bitmap. 4514 * This is only used when the postcopy migration is paused but wants 4515 * to resume from a middle point. 4516 * 4517 * Returns true if succeeded, false for errors. 4518 */ 4519 bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp) 4520 { 4521 /* from_dst_file is always valid because we're within rp_thread */ 4522 QEMUFile *file = s->rp_state.from_dst_file; 4523 g_autofree unsigned long *le_bitmap = NULL; 4524 unsigned long nbits = block->used_length >> TARGET_PAGE_BITS; 4525 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4526 uint64_t size, end_mark; 4527 RAMState *rs = ram_state; 4528 4529 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4530 4531 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4532 error_setg(errp, "Reload bitmap in incorrect state %s", 4533 MigrationStatus_str(s->state)); 4534 return false; 4535 } 4536 4537 /* 4538 * Note: see comments in ramblock_recv_bitmap_send() on why we 4539 * need the endianness conversion, and the paddings. 4540 */ 4541 local_size = ROUND_UP(local_size, 8); 4542 4543 /* Add paddings */ 4544 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4545 4546 size = qemu_get_be64(file); 4547 4548 /* The size of the bitmap should match with our ramblock */ 4549 if (size != local_size) { 4550 error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64 4551 " != 0x%"PRIx64")", block->idstr, size, local_size); 4552 return false; 4553 } 4554 4555 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4556 end_mark = qemu_get_be64(file); 4557 4558 if (qemu_file_get_error(file) || size != local_size) { 4559 error_setg(errp, "read bitmap failed for ramblock '%s': " 4560 "(size 0x%"PRIx64", got: 0x%"PRIx64")", 4561 block->idstr, local_size, size); 4562 return false; 4563 } 4564 4565 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4566 error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64, 4567 block->idstr, end_mark); 4568 return false; 4569 } 4570 4571 /* 4572 * Endianness conversion. We are during postcopy (though paused). 4573 * The dirty bitmap won't change. 
We can directly modify it. 4574 */ 4575 bitmap_from_le(block->bmap, le_bitmap, nbits); 4576 4577 /* 4578 * What we received is "received bitmap". Revert it as the initial 4579 * dirty bitmap for this ramblock. 4580 */ 4581 bitmap_complement(block->bmap, block->bmap, nbits); 4582 4583 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4584 ramblock_dirty_bitmap_clear_discarded_pages(block); 4585 4586 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4587 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4588 4589 qatomic_dec(&rs->postcopy_bmap_sync_requested); 4590 4591 /* 4592 * We succeeded to sync bitmap for current ramblock. Always kick the 4593 * migration thread to check whether all requested bitmaps are 4594 * reloaded. NOTE: it's racy to only kick when requested==0, because 4595 * we don't know whether the migration thread may still be increasing 4596 * it. 4597 */ 4598 migration_rp_kick(s); 4599 4600 return true; 4601 } 4602 4603 static int ram_resume_prepare(MigrationState *s, void *opaque) 4604 { 4605 RAMState *rs = *(RAMState **)opaque; 4606 int ret; 4607 4608 ret = ram_dirty_bitmap_sync_all(s, rs); 4609 if (ret) { 4610 return ret; 4611 } 4612 4613 ram_state_resume_prepare(rs, s->to_dst_file); 4614 4615 return 0; 4616 } 4617 4618 void postcopy_preempt_shutdown_file(MigrationState *s) 4619 { 4620 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4621 qemu_fflush(s->postcopy_qemufile_src); 4622 } 4623 4624 static SaveVMHandlers savevm_ram_handlers = { 4625 .save_setup = ram_save_setup, 4626 .save_live_iterate = ram_save_iterate, 4627 .save_live_complete_postcopy = ram_save_complete, 4628 .save_live_complete_precopy = ram_save_complete, 4629 .has_postcopy = ram_has_postcopy, 4630 .state_pending_exact = ram_state_pending_exact, 4631 .state_pending_estimate = ram_state_pending_estimate, 4632 .load_state = ram_load, 4633 .save_cleanup = ram_save_cleanup, 4634 .load_setup = ram_load_setup, 4635 .load_cleanup = ram_load_cleanup, 4636 .resume_prepare = ram_resume_prepare, 4637 }; 4638 4639 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4640 size_t old_size, size_t new_size) 4641 { 4642 PostcopyState ps = postcopy_state_get(); 4643 ram_addr_t offset; 4644 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4645 Error *err = NULL; 4646 4647 if (!rb) { 4648 error_report("RAM block not found"); 4649 return; 4650 } 4651 4652 if (migrate_ram_is_ignored(rb)) { 4653 return; 4654 } 4655 4656 if (!migration_is_idle()) { 4657 /* 4658 * Precopy code on the source cannot deal with the size of RAM blocks 4659 * changing at random points in time - especially after sending the 4660 * RAM block sizes in the migration stream, they must no longer change. 4661 * Abort and indicate a proper reason. 4662 */ 4663 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4664 migration_cancel(err); 4665 error_free(err); 4666 } 4667 4668 switch (ps) { 4669 case POSTCOPY_INCOMING_ADVISE: 4670 /* 4671 * Update what ram_postcopy_incoming_init()->init_range() does at the 4672 * time postcopy was advised. Syncing RAM blocks with the source will 4673 * result in RAM resizes. 
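 * If the block grew, the newly added tail is discarded below so that it
 * starts out in the same (discarded) state init_range() put the rest of
 * the RAM into; postcopy_length is updated either way.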
4674 */ 4675 if (old_size < new_size) { 4676 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4677 error_report("RAM block '%s' discard of resized RAM failed", 4678 rb->idstr); 4679 } 4680 } 4681 rb->postcopy_length = new_size; 4682 break; 4683 case POSTCOPY_INCOMING_NONE: 4684 case POSTCOPY_INCOMING_RUNNING: 4685 case POSTCOPY_INCOMING_END: 4686 /* 4687 * Once our guest is running, postcopy does no longer care about 4688 * resizes. When growing, the new memory was not available on the 4689 * source, no handler needed. 4690 */ 4691 break; 4692 default: 4693 error_report("RAM block '%s' resized during postcopy state: %d", 4694 rb->idstr, ps); 4695 exit(-1); 4696 } 4697 } 4698 4699 static RAMBlockNotifier ram_mig_ram_notifier = { 4700 .ram_block_resized = ram_mig_ram_block_resized, 4701 }; 4702 4703 void ram_mig_init(void) 4704 { 4705 qemu_mutex_init(&XBZRLE.lock); 4706 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4707 ram_block_notifier_add(&ram_mig_ram_notifier); 4708 } 4709
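
/*
 * Illustrative sketch only (not part of the migration code): how an external
 * tool might pull one ramblock's pages back out of a mapped-ram file, using
 * the same run-based walk over the bitmap as read_ramblock_mapped_ram()
 * above. The bitmap, the number of pages and pages_offset are assumed to
 * have been parsed from the per-block header already, and the reads are not
 * chunked to MAPPED_RAM_LOAD_BUF_SIZE as the real code does. The function
 * name and parameters are hypothetical.
 *
 *   static bool copy_block_pages(int fd, uint8_t *dest,
 *                                const unsigned long *bitmap,
 *                                unsigned long num_pages,
 *                                off_t pages_offset, size_t page_size)
 *   {
 *       unsigned long set = find_first_bit(bitmap, num_pages);
 *
 *       while (set < num_pages) {
 *           unsigned long clear = find_next_zero_bit(bitmap, num_pages,
 *                                                    set + 1);
 *           size_t len = (clear - set) * page_size;
 *           off_t off = (off_t)set * page_size;
 *
 *           if (pread(fd, dest + off, len, pages_offset + off) !=
 *               (ssize_t)len) {
 *               return false;
 *           }
 *           set = find_next_bit(bitmap, num_pages, clear + 1);
 *       }
 *       return true;
 *   }
 */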