1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram-compress.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration-stats.h" 40 #include "migration/register.h" 41 #include "migration/misc.h" 42 #include "qemu-file.h" 43 #include "postcopy-ram.h" 44 #include "page_cache.h" 45 #include "qemu/error-report.h" 46 #include "qapi/error.h" 47 #include "qapi/qapi-types-migration.h" 48 #include "qapi/qapi-events-migration.h" 49 #include "qapi/qmp/qerror.h" 50 #include "trace.h" 51 #include "exec/ram_addr.h" 52 #include "exec/target_page.h" 53 #include "qemu/rcu_queue.h" 54 #include "migration/colo.h" 55 #include "block.h" 56 #include "sysemu/cpu-throttle.h" 57 #include "savevm.h" 58 #include "qemu/iov.h" 59 #include "multifd.h" 60 #include "sysemu/runstate.h" 61 #include "options.h" 62 63 #include "hw/boards.h" /* for machine_dump_guest_core() */ 64 65 #if defined(__linux__) 66 #include "qemu/userfaultfd.h" 67 #endif /* defined(__linux__) */ 68 69 /***********************************************************/ 70 /* ram save/restore */ 71 72 /* 73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 74 * worked for pages that were filled with the same char. We switched 75 * it to only search for the zero value. And to avoid confusion with 76 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
77 */ 78 /* 79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 80 */ 81 #define RAM_SAVE_FLAG_FULL 0x01 82 #define RAM_SAVE_FLAG_ZERO 0x02 83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 84 #define RAM_SAVE_FLAG_PAGE 0x08 85 #define RAM_SAVE_FLAG_EOS 0x10 86 #define RAM_SAVE_FLAG_CONTINUE 0x20 87 #define RAM_SAVE_FLAG_XBZRLE 0x40 88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 89 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 91 /* We can't use any flag that is bigger than 0x200 */ 92 93 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, 94 uint8_t *, int) = xbzrle_encode_buffer; 95 #if defined(CONFIG_AVX512BW_OPT) 96 #include "qemu/cpuid.h" 97 static void __attribute__((constructor)) init_cpu_flag(void) 98 { 99 unsigned max = __get_cpuid_max(0, NULL); 100 int a, b, c, d; 101 if (max >= 1) { 102 __cpuid(1, a, b, c, d); 103 /* We must check that AVX is not just available, but usable. */ 104 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { 105 int bv; 106 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); 107 __cpuid_count(7, 0, a, b, c, d); 108 /* 0xe6: 109 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 110 * and ZMM16-ZMM31 state are enabled by OS) 111 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) 112 */ 113 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { 114 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; 115 } 116 } 117 } 118 } 119 #endif 120 121 XBZRLECacheStats xbzrle_counters; 122 123 /* used by the search for pages to send */ 124 struct PageSearchStatus { 125 /* The migration channel used for a specific host page */ 126 QEMUFile *pss_channel; 127 /* Last block from where we have sent data */ 128 RAMBlock *last_sent_block; 129 /* Current block being searched */ 130 RAMBlock *block; 131 /* Current page to search from */ 132 unsigned long page; 133 /* Set once we wrap around */ 134 bool complete_round; 135 /* Whether we're sending a host page */ 136 bool host_page_sending; 137 /* The start/end of current host page. Invalid if host_page_sending==false */ 138 unsigned long host_page_start; 139 unsigned long host_page_end; 140 }; 141 typedef struct PageSearchStatus PageSearchStatus; 142 143 /* struct contains XBZRLE cache and a static page 144 used by the compression */ 145 static struct { 146 /* buffer used for XBZRLE encoding */ 147 uint8_t *encoded_buf; 148 /* buffer for storing page content */ 149 uint8_t *current_buf; 150 /* Cache for XBZRLE, Protected by lock. */ 151 PageCache *cache; 152 QemuMutex lock; 153 /* it will store a page full of zeros */ 154 uint8_t *zero_target_page; 155 /* buffer used for XBZRLE decoding */ 156 uint8_t *decoded_buf; 157 } XBZRLE; 158 159 static void XBZRLE_cache_lock(void) 160 { 161 if (migrate_xbzrle()) { 162 qemu_mutex_lock(&XBZRLE.lock); 163 } 164 } 165 166 static void XBZRLE_cache_unlock(void) 167 { 168 if (migrate_xbzrle()) { 169 qemu_mutex_unlock(&XBZRLE.lock); 170 } 171 } 172 173 /** 174 * xbzrle_cache_resize: resize the xbzrle cache 175 * 176 * This function is called from migrate_params_apply in main 177 * thread, possibly while a migration is in progress. A running 178 * migration may be using the cache and might finish during this call, 179 * hence changes to the cache are protected by XBZRLE.lock(). 
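 * If the cache has not been allocated yet (no XBZRLE migration has run so
 * far), there is nothing to replace and the resize request succeeds without
 * creating a cache here.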
180 * 181 * Returns 0 for success or -1 for error 182 * 183 * @new_size: new cache size 184 * @errp: set *errp if the check failed, with reason 185 */ 186 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 187 { 188 PageCache *new_cache; 189 int64_t ret = 0; 190 191 /* Check for truncation */ 192 if (new_size != (size_t)new_size) { 193 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 194 "exceeding address space"); 195 return -1; 196 } 197 198 if (new_size == migrate_xbzrle_cache_size()) { 199 /* nothing to do */ 200 return 0; 201 } 202 203 XBZRLE_cache_lock(); 204 205 if (XBZRLE.cache != NULL) { 206 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 207 if (!new_cache) { 208 ret = -1; 209 goto out; 210 } 211 212 cache_fini(XBZRLE.cache); 213 XBZRLE.cache = new_cache; 214 } 215 out: 216 XBZRLE_cache_unlock(); 217 return ret; 218 } 219 220 static bool postcopy_preempt_active(void) 221 { 222 return migrate_postcopy_preempt() && migration_in_postcopy(); 223 } 224 225 bool ramblock_is_ignored(RAMBlock *block) 226 { 227 return !qemu_ram_is_migratable(block) || 228 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 229 } 230 231 #undef RAMBLOCK_FOREACH 232 233 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 234 { 235 RAMBlock *block; 236 int ret = 0; 237 238 RCU_READ_LOCK_GUARD(); 239 240 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 241 ret = func(block, opaque); 242 if (ret) { 243 break; 244 } 245 } 246 return ret; 247 } 248 249 static void ramblock_recv_map_init(void) 250 { 251 RAMBlock *rb; 252 253 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 254 assert(!rb->receivedmap); 255 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 256 } 257 } 258 259 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 260 { 261 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 262 rb->receivedmap); 263 } 264 265 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 266 { 267 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 268 } 269 270 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 271 { 272 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 273 } 274 275 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 276 size_t nr) 277 { 278 bitmap_set_atomic(rb->receivedmap, 279 ramblock_recv_bitmap_offset(host_addr, rb), 280 nr); 281 } 282 283 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 284 285 /* 286 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 287 * 288 * Returns >0 if success with sent bytes, or <0 if error. 289 */ 290 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 291 const char *block_name) 292 { 293 RAMBlock *block = qemu_ram_block_by_name(block_name); 294 unsigned long *le_bitmap, nbits; 295 uint64_t size; 296 297 if (!block) { 298 error_report("%s: invalid block name: %s", __func__, block_name); 299 return -1; 300 } 301 302 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 303 304 /* 305 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 306 * machines we may need 4 more bytes for padding (see below 307 * comment). So extend it a bit before hand. 308 */ 309 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 310 311 /* 312 * Always use little endian when sending the bitmap. This is 313 * required that when source and destination VMs are not using the 314 * same endianness. (Note: big endian won't work.) 
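 * bitmap_to_le() below produces that fixed little-endian layout no matter
 * what the host byte order is.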
315 */ 316 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 317 318 /* Size of the bitmap, in bytes */ 319 size = DIV_ROUND_UP(nbits, 8); 320 321 /* 322 * size is always aligned to 8 bytes for 64bit machines, but it 323 * may not be true for 32bit machines. We need this padding to 324 * make sure the migration can survive even between 32bit and 325 * 64bit machines. 326 */ 327 size = ROUND_UP(size, 8); 328 329 qemu_put_be64(file, size); 330 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 331 /* 332 * Mark as an end, in case the middle part is screwed up due to 333 * some "mysterious" reason. 334 */ 335 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 336 qemu_fflush(file); 337 338 g_free(le_bitmap); 339 340 if (qemu_file_get_error(file)) { 341 return qemu_file_get_error(file); 342 } 343 344 return size + sizeof(size); 345 } 346 347 /* 348 * An outstanding page request, on the source, having been received 349 * and queued 350 */ 351 struct RAMSrcPageRequest { 352 RAMBlock *rb; 353 hwaddr offset; 354 hwaddr len; 355 356 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 357 }; 358 359 /* State of RAM for migration */ 360 struct RAMState { 361 /* 362 * PageSearchStatus structures for the channels when send pages. 363 * Protected by the bitmap_mutex. 364 */ 365 PageSearchStatus pss[RAM_CHANNEL_MAX]; 366 /* UFFD file descriptor, used in 'write-tracking' migration */ 367 int uffdio_fd; 368 /* total ram size in bytes */ 369 uint64_t ram_bytes_total; 370 /* Last block that we have visited searching for dirty pages */ 371 RAMBlock *last_seen_block; 372 /* Last dirty target page we have sent */ 373 ram_addr_t last_page; 374 /* last ram version we have seen */ 375 uint32_t last_version; 376 /* How many times we have dirty too many pages */ 377 int dirty_rate_high_cnt; 378 /* these variables are used for bitmap sync */ 379 /* last time we did a full bitmap_sync */ 380 int64_t time_last_bitmap_sync; 381 /* bytes transferred at start_time */ 382 uint64_t bytes_xfer_prev; 383 /* number of dirty pages since start_time */ 384 uint64_t num_dirty_pages_period; 385 /* xbzrle misses since the beginning of the period */ 386 uint64_t xbzrle_cache_miss_prev; 387 /* Amount of xbzrle pages since the beginning of the period */ 388 uint64_t xbzrle_pages_prev; 389 /* Amount of xbzrle encoded bytes since the beginning of the period */ 390 uint64_t xbzrle_bytes_prev; 391 /* Are we really using XBZRLE (e.g., after the first round). 
*/ 392 bool xbzrle_started; 393 /* Are we on the last stage of migration */ 394 bool last_stage; 395 /* compression statistics since the beginning of the period */ 396 /* amount of count that no free thread to compress data */ 397 uint64_t compress_thread_busy_prev; 398 /* amount bytes after compression */ 399 uint64_t compressed_size_prev; 400 /* amount of compressed pages */ 401 uint64_t compress_pages_prev; 402 403 /* total handled target pages at the beginning of period */ 404 uint64_t target_page_count_prev; 405 /* total handled target pages since start */ 406 uint64_t target_page_count; 407 /* number of dirty bits in the bitmap */ 408 uint64_t migration_dirty_pages; 409 /* 410 * Protects: 411 * - dirty/clear bitmap 412 * - migration_dirty_pages 413 * - pss structures 414 */ 415 QemuMutex bitmap_mutex; 416 /* The RAMBlock used in the last src_page_requests */ 417 RAMBlock *last_req_rb; 418 /* Queue of outstanding page requests from the destination */ 419 QemuMutex src_page_req_mutex; 420 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 421 }; 422 typedef struct RAMState RAMState; 423 424 static RAMState *ram_state; 425 426 static NotifierWithReturnList precopy_notifier_list; 427 428 /* Whether postcopy has queued requests? */ 429 static bool postcopy_has_request(RAMState *rs) 430 { 431 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 432 } 433 434 void precopy_infrastructure_init(void) 435 { 436 notifier_with_return_list_init(&precopy_notifier_list); 437 } 438 439 void precopy_add_notifier(NotifierWithReturn *n) 440 { 441 notifier_with_return_list_add(&precopy_notifier_list, n); 442 } 443 444 void precopy_remove_notifier(NotifierWithReturn *n) 445 { 446 notifier_with_return_remove(n); 447 } 448 449 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 450 { 451 PrecopyNotifyData pnd; 452 pnd.reason = reason; 453 pnd.errp = errp; 454 455 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 456 } 457 458 uint64_t ram_bytes_remaining(void) 459 { 460 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 461 0; 462 } 463 464 void ram_transferred_add(uint64_t bytes) 465 { 466 if (runstate_is_running()) { 467 stat64_add(&mig_stats.precopy_bytes, bytes); 468 } else if (migration_in_postcopy()) { 469 stat64_add(&mig_stats.postcopy_bytes, bytes); 470 } else { 471 stat64_add(&mig_stats.downtime_bytes, bytes); 472 } 473 stat64_add(&mig_stats.transferred, bytes); 474 } 475 476 struct MigrationOps { 477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 478 }; 479 typedef struct MigrationOps MigrationOps; 480 481 MigrationOps *migration_ops; 482 483 static int ram_save_host_page_urgent(PageSearchStatus *pss); 484 485 /* NOTE: page is the PFN not real ram_addr_t. */ 486 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 487 { 488 pss->block = rb; 489 pss->page = page; 490 pss->complete_round = false; 491 } 492 493 /* 494 * Check whether two PSSs are actively sending the same page. Return true 495 * if it is, false otherwise. 
496 */ 497 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 498 { 499 return pss1->host_page_sending && pss2->host_page_sending && 500 (pss1->host_page_start == pss2->host_page_start); 501 } 502 503 /** 504 * save_page_header: write page header to wire 505 * 506 * If this is the 1st block, it also writes the block identification 507 * 508 * Returns the number of bytes written 509 * 510 * @pss: current PSS channel status 511 * @block: block that contains the page we want to send 512 * @offset: offset inside the block for the page 513 * in the lower bits, it contains flags 514 */ 515 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 516 RAMBlock *block, ram_addr_t offset) 517 { 518 size_t size, len; 519 bool same_block = (block == pss->last_sent_block); 520 521 if (same_block) { 522 offset |= RAM_SAVE_FLAG_CONTINUE; 523 } 524 qemu_put_be64(f, offset); 525 size = 8; 526 527 if (!same_block) { 528 len = strlen(block->idstr); 529 qemu_put_byte(f, len); 530 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 531 size += 1 + len; 532 pss->last_sent_block = block; 533 } 534 return size; 535 } 536 537 /** 538 * mig_throttle_guest_down: throttle down the guest 539 * 540 * Reduce amount of guest cpu execution to hopefully slow down memory 541 * writes. If guest dirty memory rate is reduced below the rate at 542 * which we can transfer pages to the destination then we should be 543 * able to complete migration. Some workloads dirty memory way too 544 * fast and will not effectively converge, even with auto-converge. 545 */ 546 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 547 uint64_t bytes_dirty_threshold) 548 { 549 uint64_t pct_initial = migrate_cpu_throttle_initial(); 550 uint64_t pct_increment = migrate_cpu_throttle_increment(); 551 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 552 int pct_max = migrate_max_cpu_throttle(); 553 554 uint64_t throttle_now = cpu_throttle_get_percentage(); 555 uint64_t cpu_now, cpu_ideal, throttle_inc; 556 557 /* We have not started throttling yet. Let's start it. */ 558 if (!cpu_throttle_active()) { 559 cpu_throttle_set(pct_initial); 560 } else { 561 /* Throttling already on, just increase the rate */ 562 if (!pct_tailslow) { 563 throttle_inc = pct_increment; 564 } else { 565 /* Compute the ideal CPU percentage used by Guest, which may 566 * make the dirty rate match the dirty rate threshold. */ 567 cpu_now = 100 - throttle_now; 568 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 569 bytes_dirty_period); 570 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 571 } 572 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 573 } 574 } 575 576 void mig_throttle_counter_reset(void) 577 { 578 RAMState *rs = ram_state; 579 580 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 581 rs->num_dirty_pages_period = 0; 582 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred); 583 } 584 585 /** 586 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 587 * 588 * @rs: current RAM state 589 * @current_addr: address for the zero page 590 * 591 * Update the xbzrle cache to reflect a page that's been sent as all 0. 592 * The important thing is that a stale (not-yet-0'd) page be replaced 593 * by the new data. 594 * As a bonus, if the page wasn't in the cache it gets added so that 595 * when a small write is made into the 0'd page it gets XBZRLE sent. 
596 */ 597 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 598 { 599 /* We don't care if this fails to allocate a new cache page 600 * as long as it updated an old one */ 601 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 602 stat64_get(&mig_stats.dirty_sync_count)); 603 } 604 605 #define ENCODING_FLAG_XBZRLE 0x1 606 607 /** 608 * save_xbzrle_page: compress and send current page 609 * 610 * Returns: 1 means that we wrote the page 611 * 0 means that page is identical to the one already sent 612 * -1 means that xbzrle would be longer than normal 613 * 614 * @rs: current RAM state 615 * @pss: current PSS channel 616 * @current_data: pointer to the address of the page contents 617 * @current_addr: addr of the page 618 * @block: block that contains the page we want to send 619 * @offset: offset inside the block for the page 620 */ 621 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 622 uint8_t **current_data, ram_addr_t current_addr, 623 RAMBlock *block, ram_addr_t offset) 624 { 625 int encoded_len = 0, bytes_xbzrle; 626 uint8_t *prev_cached_page; 627 QEMUFile *file = pss->pss_channel; 628 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 629 630 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 631 xbzrle_counters.cache_miss++; 632 if (!rs->last_stage) { 633 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 634 generation) == -1) { 635 return -1; 636 } else { 637 /* update *current_data when the page has been 638 inserted into cache */ 639 *current_data = get_cached_data(XBZRLE.cache, current_addr); 640 } 641 } 642 return -1; 643 } 644 645 /* 646 * Reaching here means the page has hit the xbzrle cache, no matter what 647 * encoding result it is (normal encoding, overflow or skipping the page), 648 * count the page as encoded. This is used to calculate the encoding rate. 649 * 650 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 651 * 2nd page turns out to be skipped (i.e. no new bytes written to the 652 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 653 * skipped page included. In this way, the encoding rate can tell if the 654 * guest page is good for xbzrle encoding. 655 */ 656 xbzrle_counters.pages++; 657 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 658 659 /* save current buffer into memory */ 660 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 661 662 /* XBZRLE encoding (if there is no overflow) */ 663 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, 664 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 665 TARGET_PAGE_SIZE); 666 667 /* 668 * Update the cache contents, so that it corresponds to the data 669 * sent, in all cases except where we skip the page. 670 */ 671 if (!rs->last_stage && encoded_len != 0) { 672 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 673 /* 674 * In the case where we couldn't compress, ensure that the caller 675 * sends the data from the cache, since the guest might have 676 * changed the RAM since we copied it. 
677 */ 678 *current_data = prev_cached_page; 679 } 680 681 if (encoded_len == 0) { 682 trace_save_xbzrle_page_skipping(); 683 return 0; 684 } else if (encoded_len == -1) { 685 trace_save_xbzrle_page_overflow(); 686 xbzrle_counters.overflow++; 687 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 688 return -1; 689 } 690 691 /* Send XBZRLE based compressed page */ 692 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 693 offset | RAM_SAVE_FLAG_XBZRLE); 694 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 695 qemu_put_be16(file, encoded_len); 696 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 697 bytes_xbzrle += encoded_len + 1 + 2; 698 /* 699 * Like compressed_size (please see update_compress_thread_counts), 700 * the xbzrle encoded bytes don't count the 8 byte header with 701 * RAM_SAVE_FLAG_CONTINUE. 702 */ 703 xbzrle_counters.bytes += bytes_xbzrle - 8; 704 ram_transferred_add(bytes_xbzrle); 705 706 return 1; 707 } 708 709 /** 710 * pss_find_next_dirty: find the next dirty page of current ramblock 711 * 712 * This function updates pss->page to point to the next dirty page index 713 * within the ramblock to migrate, or the end of ramblock when nothing 714 * found. Note that when pss->host_page_sending==true it means we're 715 * during sending a host page, so we won't look for dirty page that is 716 * outside the host page boundary. 717 * 718 * @pss: the current page search status 719 */ 720 static void pss_find_next_dirty(PageSearchStatus *pss) 721 { 722 RAMBlock *rb = pss->block; 723 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 724 unsigned long *bitmap = rb->bmap; 725 726 if (ramblock_is_ignored(rb)) { 727 /* Points directly to the end, so we know no dirty page */ 728 pss->page = size; 729 return; 730 } 731 732 /* 733 * If during sending a host page, only look for dirty pages within the 734 * current host page being send. 735 */ 736 if (pss->host_page_sending) { 737 assert(pss->host_page_end); 738 size = MIN(size, pss->host_page_end); 739 } 740 741 pss->page = find_next_bit(bitmap, size, pss->page); 742 } 743 744 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 745 unsigned long page) 746 { 747 uint8_t shift; 748 hwaddr size, start; 749 750 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 751 return; 752 } 753 754 shift = rb->clear_bmap_shift; 755 /* 756 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 757 * can make things easier sometimes since then start address 758 * of the small chunk will always be 64 pages aligned so the 759 * bitmap will always be aligned to unsigned long. We should 760 * even be able to remove this restriction but I'm simply 761 * keeping it. 762 */ 763 assert(shift >= 6); 764 765 size = 1ULL << (TARGET_PAGE_BITS + shift); 766 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 767 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 768 memory_region_clear_dirty_bitmap(rb->mr, start, size); 769 } 770 771 static void 772 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 773 unsigned long start, 774 unsigned long npages) 775 { 776 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 777 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 778 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 779 780 /* 781 * Clear pages from start to start + npages - 1, so the end boundary is 782 * exclusive. 
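     * The clear_bmap works in chunks of 1 << clear_bmap_shift pages, which
     * is why the range was rounded out to whole chunks above before clearing.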
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any page in the chunk because we need to make sure we can
     * capture further page content changes when we sync dirty log the
     * next time. So as long as we are going to send any page in the
     * chunk we clear the remote dirty bitmap for all. Clearing it
     * earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
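 * For example, a VM with 4 KiB pages plus a 2 MiB hugepage block yields
 * 0x1000 | 0x200000 = 0x201000.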
941 */ 942 uint64_t ram_pagesize_summary(void) 943 { 944 RAMBlock *block; 945 uint64_t summary = 0; 946 947 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 948 summary |= block->page_size; 949 } 950 951 return summary; 952 } 953 954 uint64_t ram_get_total_transferred_pages(void) 955 { 956 return stat64_get(&mig_stats.normal_pages) + 957 stat64_get(&mig_stats.zero_pages) + 958 compression_counters.pages + xbzrle_counters.pages; 959 } 960 961 static void migration_update_rates(RAMState *rs, int64_t end_time) 962 { 963 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 964 double compressed_size; 965 966 /* calculate period counters */ 967 stat64_set(&mig_stats.dirty_pages_rate, 968 rs->num_dirty_pages_period * 1000 / 969 (end_time - rs->time_last_bitmap_sync)); 970 971 if (!page_count) { 972 return; 973 } 974 975 if (migrate_xbzrle()) { 976 double encoded_size, unencoded_size; 977 978 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 979 rs->xbzrle_cache_miss_prev) / page_count; 980 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 981 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 982 TARGET_PAGE_SIZE; 983 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 984 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 985 xbzrle_counters.encoding_rate = 0; 986 } else { 987 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 988 } 989 rs->xbzrle_pages_prev = xbzrle_counters.pages; 990 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 991 } 992 993 if (migrate_compress()) { 994 compression_counters.busy_rate = (double)(compression_counters.busy - 995 rs->compress_thread_busy_prev) / page_count; 996 rs->compress_thread_busy_prev = compression_counters.busy; 997 998 compressed_size = compression_counters.compressed_size - 999 rs->compressed_size_prev; 1000 if (compressed_size) { 1001 double uncompressed_size = (compression_counters.pages - 1002 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1003 1004 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1005 compression_counters.compression_rate = 1006 uncompressed_size / compressed_size; 1007 1008 rs->compress_pages_prev = compression_counters.pages; 1009 rs->compressed_size_prev = compression_counters.compressed_size; 1010 } 1011 } 1012 } 1013 1014 static void migration_trigger_throttle(RAMState *rs) 1015 { 1016 uint64_t threshold = migrate_throttle_trigger_threshold(); 1017 uint64_t bytes_xfer_period = 1018 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev; 1019 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1020 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1021 1022 /* During block migration the auto-converge logic incorrectly detects 1023 * that ram migration makes no progress. Avoid this by disabling the 1024 * throttling logic during the bulk phase of block migration. */ 1025 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1026 /* The following detection logic can be refined later. For now: 1027 Check to see if the ratio between dirtied bytes and the approx. 1028 amount of bytes that just got transferred since the last time 1029 we were in this routine reaches the threshold. If that happens 1030 twice, start or increase throttling. 
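           For example, with a threshold of 50 this fires once the guest
           dirties more than half as many bytes as were transferred in the
           same period, for two consecutive sync periods.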
*/ 1031 1032 if ((bytes_dirty_period > bytes_dirty_threshold) && 1033 (++rs->dirty_rate_high_cnt >= 2)) { 1034 trace_migration_throttle(); 1035 rs->dirty_rate_high_cnt = 0; 1036 mig_throttle_guest_down(bytes_dirty_period, 1037 bytes_dirty_threshold); 1038 } 1039 } 1040 } 1041 1042 static void migration_bitmap_sync(RAMState *rs, bool last_stage) 1043 { 1044 RAMBlock *block; 1045 int64_t end_time; 1046 1047 stat64_add(&mig_stats.dirty_sync_count, 1); 1048 1049 if (!rs->time_last_bitmap_sync) { 1050 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1051 } 1052 1053 trace_migration_bitmap_sync_start(); 1054 memory_global_dirty_log_sync(last_stage); 1055 1056 qemu_mutex_lock(&rs->bitmap_mutex); 1057 WITH_RCU_READ_LOCK_GUARD() { 1058 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1059 ramblock_sync_dirty_bitmap(rs, block); 1060 } 1061 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining()); 1062 } 1063 qemu_mutex_unlock(&rs->bitmap_mutex); 1064 1065 memory_global_after_dirty_log_sync(); 1066 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1067 1068 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1069 1070 /* more than 1 second = 1000 millisecons */ 1071 if (end_time > rs->time_last_bitmap_sync + 1000) { 1072 migration_trigger_throttle(rs); 1073 1074 migration_update_rates(rs, end_time); 1075 1076 rs->target_page_count_prev = rs->target_page_count; 1077 1078 /* reset period counters */ 1079 rs->time_last_bitmap_sync = end_time; 1080 rs->num_dirty_pages_period = 0; 1081 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred); 1082 } 1083 if (migrate_events()) { 1084 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 1085 qapi_event_send_migration_pass(generation); 1086 } 1087 } 1088 1089 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage) 1090 { 1091 Error *local_err = NULL; 1092 1093 /* 1094 * The current notifier usage is just an optimization to migration, so we 1095 * don't stop the normal migration process in the error case. 1096 */ 1097 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1098 error_report_err(local_err); 1099 local_err = NULL; 1100 } 1101 1102 migration_bitmap_sync(rs, last_stage); 1103 1104 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1105 error_report_err(local_err); 1106 } 1107 } 1108 1109 void ram_release_page(const char *rbname, uint64_t offset) 1110 { 1111 if (!migrate_release_ram() || !migration_in_postcopy()) { 1112 return; 1113 } 1114 1115 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1116 } 1117 1118 /** 1119 * save_zero_page_to_file: send the zero page to the file 1120 * 1121 * Returns the size of data written to the file, 0 means the page is not 1122 * a zero page 1123 * 1124 * @pss: current PSS channel 1125 * @block: block that contains the page we want to send 1126 * @offset: offset inside the block for the page 1127 */ 1128 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, 1129 RAMBlock *block, ram_addr_t offset) 1130 { 1131 uint8_t *p = block->host + offset; 1132 int len = 0; 1133 1134 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1135 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1136 qemu_put_byte(file, 0); 1137 len += 1; 1138 ram_release_page(block->idstr, offset); 1139 } 1140 return len; 1141 } 1142 1143 /** 1144 * save_zero_page: send the zero page to the stream 1145 * 1146 * Returns the number of pages written. 
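 * (-1 if the page is not a zero page and nothing is written)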
1147 * 1148 * @pss: current PSS channel 1149 * @block: block that contains the page we want to send 1150 * @offset: offset inside the block for the page 1151 */ 1152 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, 1153 ram_addr_t offset) 1154 { 1155 int len = save_zero_page_to_file(pss, f, block, offset); 1156 1157 if (len) { 1158 stat64_add(&mig_stats.zero_pages, 1); 1159 ram_transferred_add(len); 1160 return 1; 1161 } 1162 return -1; 1163 } 1164 1165 /* 1166 * @pages: the number of pages written by the control path, 1167 * < 0 - error 1168 * > 0 - number of pages written 1169 * 1170 * Return true if the pages has been saved, otherwise false is returned. 1171 */ 1172 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, 1173 ram_addr_t offset, int *pages) 1174 { 1175 uint64_t bytes_xmit = 0; 1176 int ret; 1177 1178 *pages = -1; 1179 ret = ram_control_save_page(pss->pss_channel, block->offset, offset, 1180 TARGET_PAGE_SIZE, &bytes_xmit); 1181 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1182 return false; 1183 } 1184 1185 if (bytes_xmit) { 1186 ram_transferred_add(bytes_xmit); 1187 *pages = 1; 1188 } 1189 1190 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1191 return true; 1192 } 1193 1194 if (bytes_xmit > 0) { 1195 stat64_add(&mig_stats.normal_pages, 1); 1196 } else if (bytes_xmit == 0) { 1197 stat64_add(&mig_stats.zero_pages, 1); 1198 } 1199 1200 return true; 1201 } 1202 1203 /* 1204 * directly send the page to the stream 1205 * 1206 * Returns the number of pages written. 1207 * 1208 * @pss: current PSS channel 1209 * @block: block that contains the page we want to send 1210 * @offset: offset inside the block for the page 1211 * @buf: the page to be sent 1212 * @async: send to page asyncly 1213 */ 1214 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, 1215 ram_addr_t offset, uint8_t *buf, bool async) 1216 { 1217 QEMUFile *file = pss->pss_channel; 1218 1219 ram_transferred_add(save_page_header(pss, pss->pss_channel, block, 1220 offset | RAM_SAVE_FLAG_PAGE)); 1221 if (async) { 1222 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, 1223 migrate_release_ram() && 1224 migration_in_postcopy()); 1225 } else { 1226 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); 1227 } 1228 ram_transferred_add(TARGET_PAGE_SIZE); 1229 stat64_add(&mig_stats.normal_pages, 1); 1230 return 1; 1231 } 1232 1233 /** 1234 * ram_save_page: send the given page to the stream 1235 * 1236 * Returns the number of pages written. 1237 * < 0 - error 1238 * >=0 - Number of pages written - this might legally be 0 1239 * if xbzrle noticed the page was the same. 
1240 * 1241 * @rs: current RAM state 1242 * @block: block that contains the page we want to send 1243 * @offset: offset inside the block for the page 1244 */ 1245 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1246 { 1247 int pages = -1; 1248 uint8_t *p; 1249 bool send_async = true; 1250 RAMBlock *block = pss->block; 1251 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1252 ram_addr_t current_addr = block->offset + offset; 1253 1254 p = block->host + offset; 1255 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1256 1257 XBZRLE_cache_lock(); 1258 if (rs->xbzrle_started && !migration_in_postcopy()) { 1259 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1260 block, offset); 1261 if (!rs->last_stage) { 1262 /* Can't send this cached data async, since the cache page 1263 * might get updated before it gets to the wire 1264 */ 1265 send_async = false; 1266 } 1267 } 1268 1269 /* XBZRLE overflow or normal page */ 1270 if (pages == -1) { 1271 pages = save_normal_page(pss, block, offset, p, send_async); 1272 } 1273 1274 XBZRLE_cache_unlock(); 1275 1276 return pages; 1277 } 1278 1279 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, 1280 ram_addr_t offset) 1281 { 1282 if (multifd_queue_page(file, block, offset) < 0) { 1283 return -1; 1284 } 1285 stat64_add(&mig_stats.normal_pages, 1); 1286 1287 return 1; 1288 } 1289 1290 static void 1291 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1292 { 1293 ram_transferred_add(bytes_xmit); 1294 1295 if (param->result == RES_ZEROPAGE) { 1296 stat64_add(&mig_stats.zero_pages, 1); 1297 return; 1298 } 1299 1300 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1301 compression_counters.compressed_size += bytes_xmit - 8; 1302 compression_counters.pages++; 1303 } 1304 1305 static bool save_page_use_compression(RAMState *rs); 1306 1307 static int send_queued_data(CompressParam *param) 1308 { 1309 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY]; 1310 MigrationState *ms = migrate_get_current(); 1311 QEMUFile *file = ms->to_dst_file; 1312 int len = 0; 1313 1314 RAMBlock *block = param->block; 1315 ram_addr_t offset = param->offset; 1316 1317 if (param->result == RES_NONE) { 1318 return 0; 1319 } 1320 1321 assert(block == pss->last_sent_block); 1322 1323 if (param->result == RES_ZEROPAGE) { 1324 assert(qemu_file_buffer_empty(param->file)); 1325 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1326 qemu_put_byte(file, 0); 1327 len += 1; 1328 ram_release_page(block->idstr, offset); 1329 } else if (param->result == RES_COMPRESS) { 1330 assert(!qemu_file_buffer_empty(param->file)); 1331 len += save_page_header(pss, file, block, 1332 offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1333 len += qemu_put_qemu_file(file, param->file); 1334 } else { 1335 abort(); 1336 } 1337 1338 update_compress_thread_counts(param, len); 1339 1340 return len; 1341 } 1342 1343 static void ram_flush_compressed_data(RAMState *rs) 1344 { 1345 if (!save_page_use_compression(rs)) { 1346 return; 1347 } 1348 1349 flush_compressed_data(send_queued_data); 1350 } 1351 1352 #define PAGE_ALL_CLEAN 0 1353 #define PAGE_TRY_AGAIN 1 1354 #define PAGE_DIRTY_FOUND 2 1355 /** 1356 * find_dirty_block: find the next dirty page and update any state 1357 * associated with the search process. 
1358 * 1359 * Returns: 1360 * <0: An error happened 1361 * PAGE_ALL_CLEAN: no dirty page found, give up 1362 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1363 * PAGE_DIRTY_FOUND: dirty page found 1364 * 1365 * @rs: current RAM state 1366 * @pss: data about the state of the current dirty page scan 1367 * @again: set to false if the search has scanned the whole of RAM 1368 */ 1369 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1370 { 1371 /* Update pss->page for the next dirty bit in ramblock */ 1372 pss_find_next_dirty(pss); 1373 1374 if (pss->complete_round && pss->block == rs->last_seen_block && 1375 pss->page >= rs->last_page) { 1376 /* 1377 * We've been once around the RAM and haven't found anything. 1378 * Give up. 1379 */ 1380 return PAGE_ALL_CLEAN; 1381 } 1382 if (!offset_in_ramblock(pss->block, 1383 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1384 /* Didn't find anything in this RAM Block */ 1385 pss->page = 0; 1386 pss->block = QLIST_NEXT_RCU(pss->block, next); 1387 if (!pss->block) { 1388 if (!migrate_multifd_flush_after_each_section()) { 1389 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; 1390 int ret = multifd_send_sync_main(f); 1391 if (ret < 0) { 1392 return ret; 1393 } 1394 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 1395 qemu_fflush(f); 1396 } 1397 /* 1398 * If memory migration starts over, we will meet a dirtied page 1399 * which may still exists in compression threads's ring, so we 1400 * should flush the compressed data to make sure the new page 1401 * is not overwritten by the old one in the destination. 1402 * 1403 * Also If xbzrle is on, stop using the data compression at this 1404 * point. In theory, xbzrle can do better than compression. 1405 */ 1406 ram_flush_compressed_data(rs); 1407 1408 /* Hit the end of the list */ 1409 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1410 /* Flag that we've looped */ 1411 pss->complete_round = true; 1412 /* After the first round, enable XBZRLE. */ 1413 if (migrate_xbzrle()) { 1414 rs->xbzrle_started = true; 1415 } 1416 } 1417 /* Didn't find anything this time, but try again on the new block */ 1418 return PAGE_TRY_AGAIN; 1419 } else { 1420 /* We've found something */ 1421 return PAGE_DIRTY_FOUND; 1422 } 1423 } 1424 1425 /** 1426 * unqueue_page: gets a page of the queue 1427 * 1428 * Helper for 'get_queued_page' - gets a page off the queue 1429 * 1430 * Returns the block of the page (or NULL if none available) 1431 * 1432 * @rs: current RAM state 1433 * @offset: used to return the offset within the RAMBlock 1434 */ 1435 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1436 { 1437 struct RAMSrcPageRequest *entry; 1438 RAMBlock *block = NULL; 1439 1440 if (!postcopy_has_request(rs)) { 1441 return NULL; 1442 } 1443 1444 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1445 1446 /* 1447 * This should _never_ change even after we take the lock, because no one 1448 * should be taking anything off the request list other than us. 
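     * (ram_save_queue_pages() only ever appends new requests to this queue.)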
1449 */ 1450 assert(postcopy_has_request(rs)); 1451 1452 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1453 block = entry->rb; 1454 *offset = entry->offset; 1455 1456 if (entry->len > TARGET_PAGE_SIZE) { 1457 entry->len -= TARGET_PAGE_SIZE; 1458 entry->offset += TARGET_PAGE_SIZE; 1459 } else { 1460 memory_region_unref(block->mr); 1461 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1462 g_free(entry); 1463 migration_consume_urgent_request(); 1464 } 1465 1466 return block; 1467 } 1468 1469 #if defined(__linux__) 1470 /** 1471 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1472 * is found, return RAM block pointer and page offset 1473 * 1474 * Returns pointer to the RAMBlock containing faulting page, 1475 * NULL if no write faults are pending 1476 * 1477 * @rs: current RAM state 1478 * @offset: page offset from the beginning of the block 1479 */ 1480 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1481 { 1482 struct uffd_msg uffd_msg; 1483 void *page_address; 1484 RAMBlock *block; 1485 int res; 1486 1487 if (!migrate_background_snapshot()) { 1488 return NULL; 1489 } 1490 1491 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1492 if (res <= 0) { 1493 return NULL; 1494 } 1495 1496 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1497 block = qemu_ram_block_from_host(page_address, false, offset); 1498 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1499 return block; 1500 } 1501 1502 /** 1503 * ram_save_release_protection: release UFFD write protection after 1504 * a range of pages has been saved 1505 * 1506 * @rs: current RAM state 1507 * @pss: page-search-status structure 1508 * @start_page: index of the first page in the range relative to pss->block 1509 * 1510 * Returns 0 on success, negative value in case of an error 1511 */ 1512 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1513 unsigned long start_page) 1514 { 1515 int res = 0; 1516 1517 /* Check if page is from UFFD-managed region. */ 1518 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1519 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1520 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1521 1522 /* Flush async buffers before un-protect. */ 1523 qemu_fflush(pss->pss_channel); 1524 /* Un-protect memory range. 
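           Dropping the write protection installed by ram_write_tracking_start()
           lets vCPUs that faulted on these pages make progress again.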
*/ 1525 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1526 false, false); 1527 } 1528 1529 return res; 1530 } 1531 1532 /* ram_write_tracking_available: check if kernel supports required UFFD features 1533 * 1534 * Returns true if supports, false otherwise 1535 */ 1536 bool ram_write_tracking_available(void) 1537 { 1538 uint64_t uffd_features; 1539 int res; 1540 1541 res = uffd_query_features(&uffd_features); 1542 return (res == 0 && 1543 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1544 } 1545 1546 /* ram_write_tracking_compatible: check if guest configuration is 1547 * compatible with 'write-tracking' 1548 * 1549 * Returns true if compatible, false otherwise 1550 */ 1551 bool ram_write_tracking_compatible(void) 1552 { 1553 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1554 int uffd_fd; 1555 RAMBlock *block; 1556 bool ret = false; 1557 1558 /* Open UFFD file descriptor */ 1559 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1560 if (uffd_fd < 0) { 1561 return false; 1562 } 1563 1564 RCU_READ_LOCK_GUARD(); 1565 1566 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1567 uint64_t uffd_ioctls; 1568 1569 /* Nothing to do with read-only and MMIO-writable regions */ 1570 if (block->mr->readonly || block->mr->rom_device) { 1571 continue; 1572 } 1573 /* Try to register block memory via UFFD-IO to track writes */ 1574 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1575 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1576 goto out; 1577 } 1578 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1579 goto out; 1580 } 1581 } 1582 ret = true; 1583 1584 out: 1585 uffd_close_fd(uffd_fd); 1586 return ret; 1587 } 1588 1589 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1590 ram_addr_t size) 1591 { 1592 const ram_addr_t end = offset + size; 1593 1594 /* 1595 * We read one byte of each page; this will preallocate page tables if 1596 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1597 * where no page was populated yet. This might require adaption when 1598 * supporting other mappings, like shmem. 1599 */ 1600 for (; offset < end; offset += block->page_size) { 1601 char tmp = *((char *)block->host + offset); 1602 1603 /* Don't optimize the read out */ 1604 asm volatile("" : "+r" (tmp)); 1605 } 1606 } 1607 1608 static inline int populate_read_section(MemoryRegionSection *section, 1609 void *opaque) 1610 { 1611 const hwaddr size = int128_get64(section->size); 1612 hwaddr offset = section->offset_within_region; 1613 RAMBlock *block = section->mr->ram_block; 1614 1615 populate_read_range(block, offset, size); 1616 return 0; 1617 } 1618 1619 /* 1620 * ram_block_populate_read: preallocate page tables and populate pages in the 1621 * RAM block by reading a byte of each page. 1622 * 1623 * Since it's solely used for userfault_fd WP feature, here we just 1624 * hardcode page size to qemu_real_host_page_size. 1625 * 1626 * @block: RAM block to populate 1627 */ 1628 static void ram_block_populate_read(RAMBlock *rb) 1629 { 1630 /* 1631 * Skip populating all pages that fall into a discarded range as managed by 1632 * a RamDiscardManager responsible for the mapped memory region of the 1633 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1634 * must not get populated automatically. 
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP,
                                 NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1751 if (ram_block_uffd_protect(block, uffd_fd)) { 1752 goto fail; 1753 } 1754 1755 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1756 block->host, block->max_length); 1757 } 1758 1759 return 0; 1760 1761 fail: 1762 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1763 1764 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1765 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1766 continue; 1767 } 1768 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1769 /* Cleanup flags and remove reference */ 1770 block->flags &= ~RAM_UF_WRITEPROTECT; 1771 memory_region_unref(block->mr); 1772 } 1773 1774 uffd_close_fd(uffd_fd); 1775 rs->uffdio_fd = -1; 1776 return -1; 1777 } 1778 1779 /** 1780 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1781 */ 1782 void ram_write_tracking_stop(void) 1783 { 1784 RAMState *rs = ram_state; 1785 RAMBlock *block; 1786 1787 RCU_READ_LOCK_GUARD(); 1788 1789 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1790 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1791 continue; 1792 } 1793 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1794 1795 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1796 block->host, block->max_length); 1797 1798 /* Cleanup flags and remove reference */ 1799 block->flags &= ~RAM_UF_WRITEPROTECT; 1800 memory_region_unref(block->mr); 1801 } 1802 1803 /* Finally close UFFD file descriptor */ 1804 uffd_close_fd(rs->uffdio_fd); 1805 rs->uffdio_fd = -1; 1806 } 1807 1808 #else 1809 /* No target OS support, stubs just fail or ignore */ 1810 1811 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1812 { 1813 (void) rs; 1814 (void) offset; 1815 1816 return NULL; 1817 } 1818 1819 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1820 unsigned long start_page) 1821 { 1822 (void) rs; 1823 (void) pss; 1824 (void) start_page; 1825 1826 return 0; 1827 } 1828 1829 bool ram_write_tracking_available(void) 1830 { 1831 return false; 1832 } 1833 1834 bool ram_write_tracking_compatible(void) 1835 { 1836 assert(0); 1837 return false; 1838 } 1839 1840 int ram_write_tracking_start(void) 1841 { 1842 assert(0); 1843 return -1; 1844 } 1845 1846 void ram_write_tracking_stop(void) 1847 { 1848 assert(0); 1849 } 1850 #endif /* defined(__linux__) */ 1851 1852 /** 1853 * get_queued_page: unqueue a page from the postcopy requests 1854 * 1855 * Skips pages that are already sent (!dirty) 1856 * 1857 * Returns true if a queued page is found 1858 * 1859 * @rs: current RAM state 1860 * @pss: data about the state of the current dirty page scan 1861 */ 1862 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1863 { 1864 RAMBlock *block; 1865 ram_addr_t offset; 1866 bool dirty; 1867 1868 do { 1869 block = unqueue_page(rs, &offset); 1870 /* 1871 * We're sending this page, and since it's postcopy nothing else 1872 * will dirty it, and we must make sure it doesn't get sent again 1873 * even if this queue request was received after the background 1874 * search already sent it. 
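         * That is why we only accept the request below while the page's
         * dirty bit is still set.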
1875 */ 1876 if (block) { 1877 unsigned long page; 1878 1879 page = offset >> TARGET_PAGE_BITS; 1880 dirty = test_bit(page, block->bmap); 1881 if (!dirty) { 1882 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1883 page); 1884 } else { 1885 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1886 } 1887 } 1888 1889 } while (block && !dirty); 1890 1891 if (!block) { 1892 /* 1893 * Poll write faults too if background snapshot is enabled; that's 1894 * when we have vcpus got blocked by the write protected pages. 1895 */ 1896 block = poll_fault_page(rs, &offset); 1897 } 1898 1899 if (block) { 1900 /* 1901 * We want the background search to continue from the queued page 1902 * since the guest is likely to want other pages near to the page 1903 * it just requested. 1904 */ 1905 pss->block = block; 1906 pss->page = offset >> TARGET_PAGE_BITS; 1907 1908 /* 1909 * This unqueued page would break the "one round" check, even is 1910 * really rare. 1911 */ 1912 pss->complete_round = false; 1913 } 1914 1915 return !!block; 1916 } 1917 1918 /** 1919 * migration_page_queue_free: drop any remaining pages in the ram 1920 * request queue 1921 * 1922 * It should be empty at the end anyway, but in error cases there may 1923 * be some left. in case that there is any page left, we drop it. 1924 * 1925 */ 1926 static void migration_page_queue_free(RAMState *rs) 1927 { 1928 struct RAMSrcPageRequest *mspr, *next_mspr; 1929 /* This queue generally should be empty - but in the case of a failed 1930 * migration might have some droppings in. 1931 */ 1932 RCU_READ_LOCK_GUARD(); 1933 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1934 memory_region_unref(mspr->rb->mr); 1935 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1936 g_free(mspr); 1937 } 1938 } 1939 1940 /** 1941 * ram_save_queue_pages: queue the page for transmission 1942 * 1943 * A request from postcopy destination for example. 1944 * 1945 * Returns zero on success or negative on error 1946 * 1947 * @rbname: Name of the RAMBLock of the request. NULL means the 1948 * same that last one. 1949 * @start: starting address from the start of the RAMBlock 1950 * @len: length (in bytes) to send 1951 */ 1952 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1953 { 1954 RAMBlock *ramblock; 1955 RAMState *rs = ram_state; 1956 1957 stat64_add(&mig_stats.postcopy_requests, 1); 1958 RCU_READ_LOCK_GUARD(); 1959 1960 if (!rbname) { 1961 /* Reuse last RAMBlock */ 1962 ramblock = rs->last_req_rb; 1963 1964 if (!ramblock) { 1965 /* 1966 * Shouldn't happen, we can't reuse the last RAMBlock if 1967 * it's the 1st request. 1968 */ 1969 error_report("ram_save_queue_pages no previous block"); 1970 return -1; 1971 } 1972 } else { 1973 ramblock = qemu_ram_block_by_name(rbname); 1974 1975 if (!ramblock) { 1976 /* We shouldn't be asked for a non-existent RAMBlock */ 1977 error_report("ram_save_queue_pages no block '%s'", rbname); 1978 return -1; 1979 } 1980 rs->last_req_rb = ramblock; 1981 } 1982 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1983 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1984 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1985 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1986 __func__, start, len, ramblock->used_length); 1987 return -1; 1988 } 1989 1990 /* 1991 * When with postcopy preempt, we send back the page directly in the 1992 * rp-return thread. 
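 * Without preempt the request is only queued on rs->src_page_requests
 * below and the migration thread services it later via get_queued_page().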
1993 */ 1994 if (postcopy_preempt_active()) { 1995 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1996 size_t page_size = qemu_ram_pagesize(ramblock); 1997 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1998 int ret = 0; 1999 2000 qemu_mutex_lock(&rs->bitmap_mutex); 2001 2002 pss_init(pss, ramblock, page_start); 2003 /* 2004 * Always use the preempt channel, and make sure it's there. It's 2005 * safe to access without lock, because when rp-thread is running 2006 * we should be the only one who operates on the qemufile 2007 */ 2008 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2009 assert(pss->pss_channel); 2010 2011 /* 2012 * It must be either one or multiple of host page size. Just 2013 * assert; if something wrong we're mostly split brain anyway. 2014 */ 2015 assert(len % page_size == 0); 2016 while (len) { 2017 if (ram_save_host_page_urgent(pss)) { 2018 error_report("%s: ram_save_host_page_urgent() failed: " 2019 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2020 __func__, ramblock->idstr, start); 2021 ret = -1; 2022 break; 2023 } 2024 /* 2025 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2026 * will automatically be moved and point to the next host page 2027 * we're going to send, so no need to update here. 2028 * 2029 * Normally QEMU never sends >1 host page in requests, so 2030 * logically we don't even need that as the loop should only 2031 * run once, but just to be consistent. 2032 */ 2033 len -= page_size; 2034 }; 2035 qemu_mutex_unlock(&rs->bitmap_mutex); 2036 2037 return ret; 2038 } 2039 2040 struct RAMSrcPageRequest *new_entry = 2041 g_new0(struct RAMSrcPageRequest, 1); 2042 new_entry->rb = ramblock; 2043 new_entry->offset = start; 2044 new_entry->len = len; 2045 2046 memory_region_ref(ramblock->mr); 2047 qemu_mutex_lock(&rs->src_page_req_mutex); 2048 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2049 migration_make_urgent_request(); 2050 qemu_mutex_unlock(&rs->src_page_req_mutex); 2051 2052 return 0; 2053 } 2054 2055 static bool save_page_use_compression(RAMState *rs) 2056 { 2057 if (!migrate_compress()) { 2058 return false; 2059 } 2060 2061 /* 2062 * If xbzrle is enabled (e.g., after first round of migration), stop 2063 * using the data compression. In theory, xbzrle can do better than 2064 * compression. 2065 */ 2066 if (rs->xbzrle_started) { 2067 return false; 2068 } 2069 2070 return true; 2071 } 2072 2073 /* 2074 * try to compress the page before posting it out, return true if the page 2075 * has been properly handled by compression, otherwise needs other 2076 * paths to handle it 2077 */ 2078 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2079 RAMBlock *block, ram_addr_t offset) 2080 { 2081 if (!save_page_use_compression(rs)) { 2082 return false; 2083 } 2084 2085 /* 2086 * When starting the process of a new block, the first page of 2087 * the block should be sent out before other pages in the same 2088 * block, and all the pages in last block should have been sent 2089 * out, keeping this order is important, because the 'cont' flag 2090 * is used to avoid resending the block name. 2091 * 2092 * We post the fist page as normal page as compression will take 2093 * much CPU resource. 
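 * ram_flush_compressed_data() below drains any queued compressed pages of
 * the previous block before this one goes out as a normal page, which
 * keeps that ordering intact.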
2094 */ 2095 if (block != pss->last_sent_block) { 2096 ram_flush_compressed_data(rs); 2097 return false; 2098 } 2099 2100 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) { 2101 return true; 2102 } 2103 2104 compression_counters.busy++; 2105 return false; 2106 } 2107 2108 /** 2109 * ram_save_target_page_legacy: save one target page 2110 * 2111 * Returns the number of pages written 2112 * 2113 * @rs: current RAM state 2114 * @pss: data about the page we want to send 2115 */ 2116 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2117 { 2118 RAMBlock *block = pss->block; 2119 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2120 int res; 2121 2122 if (control_save_page(pss, block, offset, &res)) { 2123 return res; 2124 } 2125 2126 if (save_compress_page(rs, pss, block, offset)) { 2127 return 1; 2128 } 2129 2130 res = save_zero_page(pss, pss->pss_channel, block, offset); 2131 if (res > 0) { 2132 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2133 * page would be stale 2134 */ 2135 if (rs->xbzrle_started) { 2136 XBZRLE_cache_lock(); 2137 xbzrle_cache_zero_page(rs, block->offset + offset); 2138 XBZRLE_cache_unlock(); 2139 } 2140 return res; 2141 } 2142 2143 /* 2144 * Do not use multifd in postcopy as one whole host page should be 2145 * placed. Meanwhile postcopy requires atomic update of pages, so even 2146 * if host page size == guest page size the dest guest during run may 2147 * still see partially copied pages which is data corruption. 2148 */ 2149 if (migrate_multifd() && !migration_in_postcopy()) { 2150 return ram_save_multifd_page(pss->pss_channel, block, offset); 2151 } 2152 2153 return ram_save_page(rs, pss); 2154 } 2155 2156 /* Should be called before sending a host page */ 2157 static void pss_host_page_prepare(PageSearchStatus *pss) 2158 { 2159 /* How many guest pages are there in one host page? */ 2160 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2161 2162 pss->host_page_sending = true; 2163 if (guest_pfns <= 1) { 2164 /* 2165 * This covers both when guest psize == host psize, or when guest 2166 * has larger psize than the host (guest_pfns==0). 2167 * 2168 * For the latter, we always send one whole guest page per 2169 * iteration of the host page (example: an Alpha VM on x86 host 2170 * will have guest psize 8K while host psize 4K). 2171 */ 2172 pss->host_page_start = pss->page; 2173 pss->host_page_end = pss->page + 1; 2174 } else { 2175 /* 2176 * The host page spans over multiple guest pages, we send them 2177 * within the same host page iteration. 2178 */ 2179 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2180 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2181 } 2182 } 2183 2184 /* 2185 * Whether the page pointed by PSS is within the host page being sent. 2186 * Must be called after a previous pss_host_page_prepare(). 2187 */ 2188 static bool pss_within_range(PageSearchStatus *pss) 2189 { 2190 ram_addr_t ram_addr; 2191 2192 assert(pss->host_page_sending); 2193 2194 /* Over host-page boundary? 
*/ 2195 if (pss->page >= pss->host_page_end) { 2196 return false; 2197 } 2198 2199 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2200 2201 return offset_in_ramblock(pss->block, ram_addr); 2202 } 2203 2204 static void pss_host_page_finish(PageSearchStatus *pss) 2205 { 2206 pss->host_page_sending = false; 2207 /* This is not needed, but just to reset it */ 2208 pss->host_page_start = pss->host_page_end = 0; 2209 } 2210 2211 /* 2212 * Send an urgent host page specified by `pss'. Need to be called with 2213 * bitmap_mutex held. 2214 * 2215 * Returns 0 if save host page succeeded, false otherwise. 2216 */ 2217 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2218 { 2219 bool page_dirty, sent = false; 2220 RAMState *rs = ram_state; 2221 int ret = 0; 2222 2223 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2224 pss_host_page_prepare(pss); 2225 2226 /* 2227 * If precopy is sending the same page, let it be done in precopy, or 2228 * we could send the same page in two channels and none of them will 2229 * receive the whole page. 2230 */ 2231 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2232 trace_postcopy_preempt_hit(pss->block->idstr, 2233 pss->page << TARGET_PAGE_BITS); 2234 return 0; 2235 } 2236 2237 do { 2238 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2239 2240 if (page_dirty) { 2241 /* Be strict to return code; it must be 1, or what else? */ 2242 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2243 error_report_once("%s: ram_save_target_page failed", __func__); 2244 ret = -1; 2245 goto out; 2246 } 2247 sent = true; 2248 } 2249 pss_find_next_dirty(pss); 2250 } while (pss_within_range(pss)); 2251 out: 2252 pss_host_page_finish(pss); 2253 /* For urgent requests, flush immediately if sent */ 2254 if (sent) { 2255 qemu_fflush(pss->pss_channel); 2256 } 2257 return ret; 2258 } 2259 2260 /** 2261 * ram_save_host_page: save a whole host page 2262 * 2263 * Starting at *offset send pages up to the end of the current host 2264 * page. It's valid for the initial offset to point into the middle of 2265 * a host page in which case the remainder of the hostpage is sent. 2266 * Only dirty target pages are sent. Note that the host page size may 2267 * be a huge page for this block. 2268 * 2269 * The saving stops at the boundary of the used_length of the block 2270 * if the RAMBlock isn't a multiple of the host page size. 2271 * 2272 * The caller must be with ram_state.bitmap_mutex held to call this 2273 * function. Note that this function can temporarily release the lock, but 2274 * when the function is returned it'll make sure the lock is still held. 
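 *
 * For example, with a 2MiB hugepage backed RAMBlock and 4KiB target pages
 * one call walks up to 512 target pages, dropping and re-taking
 * bitmap_mutex around each page it sends when postcopy preempt is active.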
2275 * 2276 * Returns the number of pages written or negative on error 2277 * 2278 * @rs: current RAM state 2279 * @pss: data about the page we want to send 2280 */ 2281 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2282 { 2283 bool page_dirty, preempt_active = postcopy_preempt_active(); 2284 int tmppages, pages = 0; 2285 size_t pagesize_bits = 2286 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2287 unsigned long start_page = pss->page; 2288 int res; 2289 2290 if (ramblock_is_ignored(pss->block)) { 2291 error_report("block %s should not be migrated !", pss->block->idstr); 2292 return 0; 2293 } 2294 2295 /* Update host page boundary information */ 2296 pss_host_page_prepare(pss); 2297 2298 do { 2299 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2300 2301 /* Check the pages is dirty and if it is send it */ 2302 if (page_dirty) { 2303 /* 2304 * Properly yield the lock only in postcopy preempt mode 2305 * because both migration thread and rp-return thread can 2306 * operate on the bitmaps. 2307 */ 2308 if (preempt_active) { 2309 qemu_mutex_unlock(&rs->bitmap_mutex); 2310 } 2311 tmppages = migration_ops->ram_save_target_page(rs, pss); 2312 if (tmppages >= 0) { 2313 pages += tmppages; 2314 /* 2315 * Allow rate limiting to happen in the middle of huge pages if 2316 * something is sent in the current iteration. 2317 */ 2318 if (pagesize_bits > 1 && tmppages > 0) { 2319 migration_rate_limit(); 2320 } 2321 } 2322 if (preempt_active) { 2323 qemu_mutex_lock(&rs->bitmap_mutex); 2324 } 2325 } else { 2326 tmppages = 0; 2327 } 2328 2329 if (tmppages < 0) { 2330 pss_host_page_finish(pss); 2331 return tmppages; 2332 } 2333 2334 pss_find_next_dirty(pss); 2335 } while (pss_within_range(pss)); 2336 2337 pss_host_page_finish(pss); 2338 2339 res = ram_save_release_protection(rs, pss, start_page); 2340 return (res < 0 ? res : pages); 2341 } 2342 2343 /** 2344 * ram_find_and_save_block: finds a dirty page and sends it to f 2345 * 2346 * Called within an RCU critical section. 2347 * 2348 * Returns the number of pages written where zero means no dirty pages, 2349 * or negative on error 2350 * 2351 * @rs: current RAM state 2352 * 2353 * On systems where host-page-size > target-page-size it will send all the 2354 * pages in a host page that are dirty. 2355 */ 2356 static int ram_find_and_save_block(RAMState *rs) 2357 { 2358 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2359 int pages = 0; 2360 2361 /* No dirty page as there is zero RAM */ 2362 if (!rs->ram_bytes_total) { 2363 return pages; 2364 } 2365 2366 /* 2367 * Always keep last_seen_block/last_page valid during this procedure, 2368 * because find_dirty_block() relies on these values (e.g., we compare 2369 * last_seen_block with pss.block to see whether we searched all the 2370 * ramblocks) to detect the completion of migration. Having NULL value 2371 * of last_seen_block can conditionally cause below loop to run forever. 
2372 */ 2373 if (!rs->last_seen_block) { 2374 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2375 rs->last_page = 0; 2376 } 2377 2378 pss_init(pss, rs->last_seen_block, rs->last_page); 2379 2380 while (true){ 2381 if (!get_queued_page(rs, pss)) { 2382 /* priority queue empty, so just search for something dirty */ 2383 int res = find_dirty_block(rs, pss); 2384 if (res != PAGE_DIRTY_FOUND) { 2385 if (res == PAGE_ALL_CLEAN) { 2386 break; 2387 } else if (res == PAGE_TRY_AGAIN) { 2388 continue; 2389 } else if (res < 0) { 2390 pages = res; 2391 break; 2392 } 2393 } 2394 } 2395 pages = ram_save_host_page(rs, pss); 2396 if (pages) { 2397 break; 2398 } 2399 } 2400 2401 rs->last_seen_block = pss->block; 2402 rs->last_page = pss->page; 2403 2404 return pages; 2405 } 2406 2407 static uint64_t ram_bytes_total_with_ignored(void) 2408 { 2409 RAMBlock *block; 2410 uint64_t total = 0; 2411 2412 RCU_READ_LOCK_GUARD(); 2413 2414 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2415 total += block->used_length; 2416 } 2417 return total; 2418 } 2419 2420 uint64_t ram_bytes_total(void) 2421 { 2422 RAMBlock *block; 2423 uint64_t total = 0; 2424 2425 RCU_READ_LOCK_GUARD(); 2426 2427 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2428 total += block->used_length; 2429 } 2430 return total; 2431 } 2432 2433 static void xbzrle_load_setup(void) 2434 { 2435 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2436 } 2437 2438 static void xbzrle_load_cleanup(void) 2439 { 2440 g_free(XBZRLE.decoded_buf); 2441 XBZRLE.decoded_buf = NULL; 2442 } 2443 2444 static void ram_state_cleanup(RAMState **rsp) 2445 { 2446 if (*rsp) { 2447 migration_page_queue_free(*rsp); 2448 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2449 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2450 g_free(*rsp); 2451 *rsp = NULL; 2452 } 2453 } 2454 2455 static void xbzrle_cleanup(void) 2456 { 2457 XBZRLE_cache_lock(); 2458 if (XBZRLE.cache) { 2459 cache_fini(XBZRLE.cache); 2460 g_free(XBZRLE.encoded_buf); 2461 g_free(XBZRLE.current_buf); 2462 g_free(XBZRLE.zero_target_page); 2463 XBZRLE.cache = NULL; 2464 XBZRLE.encoded_buf = NULL; 2465 XBZRLE.current_buf = NULL; 2466 XBZRLE.zero_target_page = NULL; 2467 } 2468 XBZRLE_cache_unlock(); 2469 } 2470 2471 static void ram_save_cleanup(void *opaque) 2472 { 2473 RAMState **rsp = opaque; 2474 RAMBlock *block; 2475 2476 /* We don't use dirty log with background snapshots */ 2477 if (!migrate_background_snapshot()) { 2478 /* caller have hold iothread lock or is in a bh, so there is 2479 * no writing race against the migration bitmap 2480 */ 2481 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2482 /* 2483 * do not stop dirty log without starting it, since 2484 * memory_global_dirty_log_stop will assert that 2485 * memory_global_dirty_log_start/stop used in pairs 2486 */ 2487 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2488 } 2489 } 2490 2491 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2492 g_free(block->clear_bmap); 2493 block->clear_bmap = NULL; 2494 g_free(block->bmap); 2495 block->bmap = NULL; 2496 } 2497 2498 xbzrle_cleanup(); 2499 compress_threads_save_cleanup(); 2500 ram_state_cleanup(rsp); 2501 g_free(migration_ops); 2502 migration_ops = NULL; 2503 } 2504 2505 static void ram_state_reset(RAMState *rs) 2506 { 2507 int i; 2508 2509 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2510 rs->pss[i].last_sent_block = NULL; 2511 } 2512 2513 rs->last_seen_block = NULL; 2514 rs->last_page = 0; 2515 rs->last_version = ram_list.version; 2516 rs->xbzrle_started = false; 2517 } 2518 2519 #define MAX_WAIT 50 /* ms, half buffered_file 
limit */ 2520 2521 /* **** functions for postcopy ***** */ 2522 2523 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2524 { 2525 struct RAMBlock *block; 2526 2527 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2528 unsigned long *bitmap = block->bmap; 2529 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2530 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2531 2532 while (run_start < range) { 2533 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2534 ram_discard_range(block->idstr, 2535 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2536 ((ram_addr_t)(run_end - run_start)) 2537 << TARGET_PAGE_BITS); 2538 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2539 } 2540 } 2541 } 2542 2543 /** 2544 * postcopy_send_discard_bm_ram: discard a RAMBlock 2545 * 2546 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2547 * 2548 * @ms: current migration state 2549 * @block: RAMBlock to discard 2550 */ 2551 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2552 { 2553 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2554 unsigned long current; 2555 unsigned long *bitmap = block->bmap; 2556 2557 for (current = 0; current < end; ) { 2558 unsigned long one = find_next_bit(bitmap, end, current); 2559 unsigned long zero, discard_length; 2560 2561 if (one >= end) { 2562 break; 2563 } 2564 2565 zero = find_next_zero_bit(bitmap, end, one + 1); 2566 2567 if (zero >= end) { 2568 discard_length = end - one; 2569 } else { 2570 discard_length = zero - one; 2571 } 2572 postcopy_discard_send_range(ms, one, discard_length); 2573 current = one + discard_length; 2574 } 2575 } 2576 2577 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2578 2579 /** 2580 * postcopy_each_ram_send_discard: discard all RAMBlocks 2581 * 2582 * Utility for the outgoing postcopy code. 2583 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2584 * passing it bitmap indexes and name. 2585 * (qemu_ram_foreach_block ends up passing unscaled lengths 2586 * which would mean postcopy code would have to deal with target page) 2587 * 2588 * @ms: current migration state 2589 */ 2590 static void postcopy_each_ram_send_discard(MigrationState *ms) 2591 { 2592 struct RAMBlock *block; 2593 2594 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2595 postcopy_discard_send_init(ms, block->idstr); 2596 2597 /* 2598 * Deal with TPS != HPS and huge pages. It discard any partially sent 2599 * host-page size chunks, mark any partially dirty host-page size 2600 * chunks as all dirty. In this case the host-page is the host-page 2601 * for the particular RAMBlock, i.e. it might be a huge page. 2602 */ 2603 postcopy_chunk_hostpages_pass(ms, block); 2604 2605 /* 2606 * Postcopy sends chunks of bitmap over the wire, but it 2607 * just needs indexes at this point, avoids it having 2608 * target page specific code. 2609 */ 2610 postcopy_send_discard_bm_ram(ms, block); 2611 postcopy_discard_send_finish(ms); 2612 } 2613 } 2614 2615 /** 2616 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2617 * 2618 * Helper for postcopy_chunk_hostpages; it's called twice to 2619 * canonicalize the two bitmaps, that are similar, but one is 2620 * inverted. 2621 * 2622 * Postcopy requires that all target pages in a hostpage are dirty or 2623 * clean, not a mix. This function canonicalizes the bitmaps. 
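 *
 * For example, with 2MiB host pages and 4KiB target pages (host_ratio of
 * 512), a run of dirty bits that starts or ends inside a host page is
 * widened so that the whole 512-bit chunk reads as dirty.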
2624 * 2625 * @ms: current migration state 2626 * @block: block that contains the page we want to canonicalize 2627 */ 2628 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2629 { 2630 RAMState *rs = ram_state; 2631 unsigned long *bitmap = block->bmap; 2632 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2633 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2634 unsigned long run_start; 2635 2636 if (block->page_size == TARGET_PAGE_SIZE) { 2637 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2638 return; 2639 } 2640 2641 /* Find a dirty page */ 2642 run_start = find_next_bit(bitmap, pages, 0); 2643 2644 while (run_start < pages) { 2645 2646 /* 2647 * If the start of this run of pages is in the middle of a host 2648 * page, then we need to fixup this host page. 2649 */ 2650 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2651 /* Find the end of this run */ 2652 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2653 /* 2654 * If the end isn't at the start of a host page, then the 2655 * run doesn't finish at the end of a host page 2656 * and we need to discard. 2657 */ 2658 } 2659 2660 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2661 unsigned long page; 2662 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2663 host_ratio); 2664 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2665 2666 /* Clean up the bitmap */ 2667 for (page = fixup_start_addr; 2668 page < fixup_start_addr + host_ratio; page++) { 2669 /* 2670 * Remark them as dirty, updating the count for any pages 2671 * that weren't previously dirty. 2672 */ 2673 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2674 } 2675 } 2676 2677 /* Find the next dirty page for the next iteration */ 2678 run_start = find_next_bit(bitmap, pages, run_start); 2679 } 2680 } 2681 2682 /** 2683 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2684 * 2685 * Transmit the set of pages to be discarded after precopy to the target 2686 * these are pages that: 2687 * a) Have been previously transmitted but are now dirty again 2688 * b) Pages that have never been transmitted, this ensures that 2689 * any pages on the destination that have been mapped by background 2690 * tasks get discarded (transparent huge pages is the specific concern) 2691 * Hopefully this is pretty sparse 2692 * 2693 * @ms: current migration state 2694 */ 2695 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2696 { 2697 RAMState *rs = ram_state; 2698 2699 RCU_READ_LOCK_GUARD(); 2700 2701 /* This should be our last sync, the src is now paused */ 2702 migration_bitmap_sync(rs, false); 2703 2704 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2705 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2706 rs->last_seen_block = NULL; 2707 rs->last_page = 0; 2708 2709 postcopy_each_ram_send_discard(ms); 2710 2711 trace_ram_postcopy_send_discard_bitmap(); 2712 } 2713 2714 /** 2715 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2716 * 2717 * Returns zero on success 2718 * 2719 * @rbname: name of the RAMBlock of the request. NULL means the 2720 * same that last one. 
2721 * @start: RAMBlock starting page 2722 * @length: RAMBlock size 2723 */ 2724 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2725 { 2726 trace_ram_discard_range(rbname, start, length); 2727 2728 RCU_READ_LOCK_GUARD(); 2729 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2730 2731 if (!rb) { 2732 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2733 return -1; 2734 } 2735 2736 /* 2737 * On source VM, we don't need to update the received bitmap since 2738 * we don't even have one. 2739 */ 2740 if (rb->receivedmap) { 2741 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2742 length >> qemu_target_page_bits()); 2743 } 2744 2745 return ram_block_discard_range(rb, start, length); 2746 } 2747 2748 /* 2749 * For every allocation, we will try not to crash the VM if the 2750 * allocation failed. 2751 */ 2752 static int xbzrle_init(void) 2753 { 2754 Error *local_err = NULL; 2755 2756 if (!migrate_xbzrle()) { 2757 return 0; 2758 } 2759 2760 XBZRLE_cache_lock(); 2761 2762 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2763 if (!XBZRLE.zero_target_page) { 2764 error_report("%s: Error allocating zero page", __func__); 2765 goto err_out; 2766 } 2767 2768 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2769 TARGET_PAGE_SIZE, &local_err); 2770 if (!XBZRLE.cache) { 2771 error_report_err(local_err); 2772 goto free_zero_page; 2773 } 2774 2775 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2776 if (!XBZRLE.encoded_buf) { 2777 error_report("%s: Error allocating encoded_buf", __func__); 2778 goto free_cache; 2779 } 2780 2781 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2782 if (!XBZRLE.current_buf) { 2783 error_report("%s: Error allocating current_buf", __func__); 2784 goto free_encoded_buf; 2785 } 2786 2787 /* We are all good */ 2788 XBZRLE_cache_unlock(); 2789 return 0; 2790 2791 free_encoded_buf: 2792 g_free(XBZRLE.encoded_buf); 2793 XBZRLE.encoded_buf = NULL; 2794 free_cache: 2795 cache_fini(XBZRLE.cache); 2796 XBZRLE.cache = NULL; 2797 free_zero_page: 2798 g_free(XBZRLE.zero_target_page); 2799 XBZRLE.zero_target_page = NULL; 2800 err_out: 2801 XBZRLE_cache_unlock(); 2802 return -ENOMEM; 2803 } 2804 2805 static int ram_state_init(RAMState **rsp) 2806 { 2807 *rsp = g_try_new0(RAMState, 1); 2808 2809 if (!*rsp) { 2810 error_report("%s: Init ramstate fail", __func__); 2811 return -1; 2812 } 2813 2814 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2815 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2816 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2817 (*rsp)->ram_bytes_total = ram_bytes_total(); 2818 2819 /* 2820 * Count the total number of pages used by ram blocks not including any 2821 * gaps due to alignment or unplugs. 2822 * This must match with the initial values of dirty bitmap. 
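 * Both this count and ram_list_init_bitmaps() walk
 * RAMBLOCK_FOREACH_NOT_IGNORED, so ignored blocks are left out of each.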
2823 */ 2824 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2825 ram_state_reset(*rsp); 2826 2827 return 0; 2828 } 2829 2830 static void ram_list_init_bitmaps(void) 2831 { 2832 MigrationState *ms = migrate_get_current(); 2833 RAMBlock *block; 2834 unsigned long pages; 2835 uint8_t shift; 2836 2837 /* Skip setting bitmap if there is no RAM */ 2838 if (ram_bytes_total()) { 2839 shift = ms->clear_bitmap_shift; 2840 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2841 error_report("clear_bitmap_shift (%u) too big, using " 2842 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2843 shift = CLEAR_BITMAP_SHIFT_MAX; 2844 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2845 error_report("clear_bitmap_shift (%u) too small, using " 2846 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2847 shift = CLEAR_BITMAP_SHIFT_MIN; 2848 } 2849 2850 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2851 pages = block->max_length >> TARGET_PAGE_BITS; 2852 /* 2853 * The initial dirty bitmap for migration must be set with all 2854 * ones to make sure we'll migrate every guest RAM page to 2855 * destination. 2856 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2857 * new migration after a failed migration, ram_list. 2858 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2859 * guest memory. 2860 */ 2861 block->bmap = bitmap_new(pages); 2862 bitmap_set(block->bmap, 0, pages); 2863 block->clear_bmap_shift = shift; 2864 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2865 } 2866 } 2867 } 2868 2869 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2870 { 2871 unsigned long pages; 2872 RAMBlock *rb; 2873 2874 RCU_READ_LOCK_GUARD(); 2875 2876 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2877 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2878 rs->migration_dirty_pages -= pages; 2879 } 2880 } 2881 2882 static void ram_init_bitmaps(RAMState *rs) 2883 { 2884 /* For memory_global_dirty_log_start below. */ 2885 qemu_mutex_lock_iothread(); 2886 qemu_mutex_lock_ramlist(); 2887 2888 WITH_RCU_READ_LOCK_GUARD() { 2889 ram_list_init_bitmaps(); 2890 /* We don't use dirty log with background snapshots */ 2891 if (!migrate_background_snapshot()) { 2892 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2893 migration_bitmap_sync_precopy(rs, false); 2894 } 2895 } 2896 qemu_mutex_unlock_ramlist(); 2897 qemu_mutex_unlock_iothread(); 2898 2899 /* 2900 * After an eventual first bitmap sync, fixup the initial bitmap 2901 * containing all 1s to exclude any discarded pages from migration. 2902 */ 2903 migration_bitmap_clear_discarded_pages(rs); 2904 } 2905 2906 static int ram_init_all(RAMState **rsp) 2907 { 2908 if (ram_state_init(rsp)) { 2909 return -1; 2910 } 2911 2912 if (xbzrle_init()) { 2913 ram_state_cleanup(rsp); 2914 return -1; 2915 } 2916 2917 ram_init_bitmaps(*rsp); 2918 2919 return 0; 2920 } 2921 2922 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2923 { 2924 RAMBlock *block; 2925 uint64_t pages = 0; 2926 2927 /* 2928 * Postcopy is not using xbzrle/compression, so no need for that. 2929 * Also, since source are already halted, we don't need to care 2930 * about dirty page logging as well. 2931 */ 2932 2933 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2934 pages += bitmap_count_one(block->bmap, 2935 block->used_length >> TARGET_PAGE_BITS); 2936 } 2937 2938 /* This may not be aligned with current bitmaps. Recalculate. 
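 * Postcopy recovery inherits whatever bitmap state the interrupted
 * migration left behind, hence the bitmap_count_one() pass above.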
*/ 2939 rs->migration_dirty_pages = pages; 2940 2941 ram_state_reset(rs); 2942 2943 /* Update RAMState cache of output QEMUFile */ 2944 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2945 2946 trace_ram_state_resume_prepare(pages); 2947 } 2948 2949 /* 2950 * This function clears bits of the free pages reported by the caller from the 2951 * migration dirty bitmap. @addr is the host address corresponding to the 2952 * start of the continuous guest free pages, and @len is the total bytes of 2953 * those pages. 2954 */ 2955 void qemu_guest_free_page_hint(void *addr, size_t len) 2956 { 2957 RAMBlock *block; 2958 ram_addr_t offset; 2959 size_t used_len, start, npages; 2960 MigrationState *s = migrate_get_current(); 2961 2962 /* This function is currently expected to be used during live migration */ 2963 if (!migration_is_setup_or_active(s->state)) { 2964 return; 2965 } 2966 2967 for (; len > 0; len -= used_len, addr += used_len) { 2968 block = qemu_ram_block_from_host(addr, false, &offset); 2969 if (unlikely(!block || offset >= block->used_length)) { 2970 /* 2971 * The implementation might not support RAMBlock resize during 2972 * live migration, but it could happen in theory with future 2973 * updates. So we add a check here to capture that case. 2974 */ 2975 error_report_once("%s unexpected error", __func__); 2976 return; 2977 } 2978 2979 if (len <= block->used_length - offset) { 2980 used_len = len; 2981 } else { 2982 used_len = block->used_length - offset; 2983 } 2984 2985 start = offset >> TARGET_PAGE_BITS; 2986 npages = used_len >> TARGET_PAGE_BITS; 2987 2988 qemu_mutex_lock(&ram_state->bitmap_mutex); 2989 /* 2990 * The skipped free pages are equavalent to be sent from clear_bmap's 2991 * perspective, so clear the bits from the memory region bitmap which 2992 * are initially set. Otherwise those skipped pages will be sent in 2993 * the next round after syncing from the memory region bitmap. 2994 */ 2995 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2996 ram_state->migration_dirty_pages -= 2997 bitmap_count_one_with_offset(block->bmap, start, npages); 2998 bitmap_clear(block->bmap, start, npages); 2999 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3000 } 3001 } 3002 3003 /* 3004 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3005 * long-running RCU critical section. When rcu-reclaims in the code 3006 * start to become numerous it will be necessary to reduce the 3007 * granularity of these critical sections. 3008 */ 3009 3010 /** 3011 * ram_save_setup: Setup RAM for migration 3012 * 3013 * Returns zero to indicate success and negative for error 3014 * 3015 * @f: QEMUFile where to send the data 3016 * @opaque: RAMState pointer 3017 */ 3018 static int ram_save_setup(QEMUFile *f, void *opaque) 3019 { 3020 RAMState **rsp = opaque; 3021 RAMBlock *block; 3022 int ret; 3023 3024 if (compress_threads_save_setup()) { 3025 return -1; 3026 } 3027 3028 /* migration has already setup the bitmap, reuse it. 
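 * (in COLO state the bitmaps from the initial migration are still live,
 * so ram_init_all() is skipped and they are used as-is)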
*/ 3029 if (!migration_in_colo_state()) { 3030 if (ram_init_all(rsp) != 0) { 3031 compress_threads_save_cleanup(); 3032 return -1; 3033 } 3034 } 3035 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3036 3037 WITH_RCU_READ_LOCK_GUARD() { 3038 qemu_put_be64(f, ram_bytes_total_with_ignored() 3039 | RAM_SAVE_FLAG_MEM_SIZE); 3040 3041 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3042 qemu_put_byte(f, strlen(block->idstr)); 3043 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3044 qemu_put_be64(f, block->used_length); 3045 if (migrate_postcopy_ram() && block->page_size != 3046 qemu_host_page_size) { 3047 qemu_put_be64(f, block->page_size); 3048 } 3049 if (migrate_ignore_shared()) { 3050 qemu_put_be64(f, block->mr->addr); 3051 } 3052 } 3053 } 3054 3055 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3056 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3057 3058 migration_ops = g_malloc0(sizeof(MigrationOps)); 3059 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3060 ret = multifd_send_sync_main(f); 3061 if (ret < 0) { 3062 return ret; 3063 } 3064 3065 if (!migrate_multifd_flush_after_each_section()) { 3066 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3067 } 3068 3069 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3070 qemu_fflush(f); 3071 3072 return 0; 3073 } 3074 3075 /** 3076 * ram_save_iterate: iterative stage for migration 3077 * 3078 * Returns zero to indicate success and negative for error 3079 * 3080 * @f: QEMUFile where to send the data 3081 * @opaque: RAMState pointer 3082 */ 3083 static int ram_save_iterate(QEMUFile *f, void *opaque) 3084 { 3085 RAMState **temp = opaque; 3086 RAMState *rs = *temp; 3087 int ret = 0; 3088 int i; 3089 int64_t t0; 3090 int done = 0; 3091 3092 if (blk_mig_bulk_active()) { 3093 /* Avoid transferring ram during bulk phase of block migration as 3094 * the bulk phase will usually take a long time and transferring 3095 * ram updates during that time is pointless. */ 3096 goto out; 3097 } 3098 3099 /* 3100 * We'll take this lock a little bit long, but it's okay for two reasons. 3101 * Firstly, the only possible other thread to take it is who calls 3102 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3103 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3104 * guarantees that we'll at least released it in a regular basis. 3105 */ 3106 qemu_mutex_lock(&rs->bitmap_mutex); 3107 WITH_RCU_READ_LOCK_GUARD() { 3108 if (ram_list.version != rs->last_version) { 3109 ram_state_reset(rs); 3110 } 3111 3112 /* Read version before ram_list.blocks */ 3113 smp_rmb(); 3114 3115 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3116 3117 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3118 i = 0; 3119 while ((ret = qemu_file_rate_limit(f)) == 0 || 3120 postcopy_has_request(rs)) { 3121 int pages; 3122 3123 if (qemu_file_get_error(f)) { 3124 break; 3125 } 3126 3127 pages = ram_find_and_save_block(rs); 3128 /* no more pages to sent */ 3129 if (pages == 0) { 3130 done = 1; 3131 break; 3132 } 3133 3134 if (pages < 0) { 3135 qemu_file_set_error(f, pages); 3136 break; 3137 } 3138 3139 rs->target_page_count += pages; 3140 3141 /* 3142 * During postcopy, it is necessary to make sure one whole host 3143 * page is sent in one chunk. 3144 */ 3145 if (migrate_postcopy_ram()) { 3146 ram_flush_compressed_data(rs); 3147 } 3148 3149 /* 3150 * we want to check in the 1st loop, just in case it was the 1st 3151 * time and we had to sync the dirty bitmap. 
3152 * qemu_clock_get_ns() is a bit expensive, so we only check each 3153 * some iterations 3154 */ 3155 if ((i & 63) == 0) { 3156 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3157 1000000; 3158 if (t1 > MAX_WAIT) { 3159 trace_ram_save_iterate_big_wait(t1, i); 3160 break; 3161 } 3162 } 3163 i++; 3164 } 3165 } 3166 qemu_mutex_unlock(&rs->bitmap_mutex); 3167 3168 /* 3169 * Must occur before EOS (or any QEMUFile operation) 3170 * because of RDMA protocol. 3171 */ 3172 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3173 3174 out: 3175 if (ret >= 0 3176 && migration_is_setup_or_active(migrate_get_current()->state)) { 3177 if (migrate_multifd_flush_after_each_section()) { 3178 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3179 if (ret < 0) { 3180 return ret; 3181 } 3182 } 3183 3184 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3185 qemu_fflush(f); 3186 ram_transferred_add(8); 3187 3188 ret = qemu_file_get_error(f); 3189 } 3190 if (ret < 0) { 3191 return ret; 3192 } 3193 3194 return done; 3195 } 3196 3197 /** 3198 * ram_save_complete: function called to send the remaining amount of ram 3199 * 3200 * Returns zero to indicate success or negative on error 3201 * 3202 * Called with iothread lock 3203 * 3204 * @f: QEMUFile where to send the data 3205 * @opaque: RAMState pointer 3206 */ 3207 static int ram_save_complete(QEMUFile *f, void *opaque) 3208 { 3209 RAMState **temp = opaque; 3210 RAMState *rs = *temp; 3211 int ret = 0; 3212 3213 rs->last_stage = !migration_in_colo_state(); 3214 3215 WITH_RCU_READ_LOCK_GUARD() { 3216 if (!migration_in_postcopy()) { 3217 migration_bitmap_sync_precopy(rs, true); 3218 } 3219 3220 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3221 3222 /* try transferring iterative blocks of memory */ 3223 3224 /* flush all remaining blocks regardless of rate limiting */ 3225 qemu_mutex_lock(&rs->bitmap_mutex); 3226 while (true) { 3227 int pages; 3228 3229 pages = ram_find_and_save_block(rs); 3230 /* no more blocks to sent */ 3231 if (pages == 0) { 3232 break; 3233 } 3234 if (pages < 0) { 3235 ret = pages; 3236 break; 3237 } 3238 } 3239 qemu_mutex_unlock(&rs->bitmap_mutex); 3240 3241 ram_flush_compressed_data(rs); 3242 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3243 } 3244 3245 if (ret < 0) { 3246 return ret; 3247 } 3248 3249 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3250 if (ret < 0) { 3251 return ret; 3252 } 3253 3254 if (!migrate_multifd_flush_after_each_section()) { 3255 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3256 } 3257 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3258 qemu_fflush(f); 3259 3260 return 0; 3261 } 3262 3263 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3264 uint64_t *can_postcopy) 3265 { 3266 RAMState **temp = opaque; 3267 RAMState *rs = *temp; 3268 3269 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3270 3271 if (migrate_postcopy_ram()) { 3272 /* We can do postcopy, and all the data is postcopiable */ 3273 *can_postcopy += remaining_size; 3274 } else { 3275 *must_precopy += remaining_size; 3276 } 3277 } 3278 3279 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3280 uint64_t *can_postcopy) 3281 { 3282 MigrationState *s = migrate_get_current(); 3283 RAMState **temp = opaque; 3284 RAMState *rs = *temp; 3285 3286 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3287 3288 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3289 qemu_mutex_lock_iothread(); 3290 
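        /*
         * Close to convergence: do one more bitmap sync under the BQL so
         * that the dirty page count reported below is exact.
         */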
WITH_RCU_READ_LOCK_GUARD() { 3291 migration_bitmap_sync_precopy(rs, false); 3292 } 3293 qemu_mutex_unlock_iothread(); 3294 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3295 } 3296 3297 if (migrate_postcopy_ram()) { 3298 /* We can do postcopy, and all the data is postcopiable */ 3299 *can_postcopy += remaining_size; 3300 } else { 3301 *must_precopy += remaining_size; 3302 } 3303 } 3304 3305 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3306 { 3307 unsigned int xh_len; 3308 int xh_flags; 3309 uint8_t *loaded_data; 3310 3311 /* extract RLE header */ 3312 xh_flags = qemu_get_byte(f); 3313 xh_len = qemu_get_be16(f); 3314 3315 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3316 error_report("Failed to load XBZRLE page - wrong compression!"); 3317 return -1; 3318 } 3319 3320 if (xh_len > TARGET_PAGE_SIZE) { 3321 error_report("Failed to load XBZRLE page - len overflow!"); 3322 return -1; 3323 } 3324 loaded_data = XBZRLE.decoded_buf; 3325 /* load data and decode */ 3326 /* it can change loaded_data to point to an internal buffer */ 3327 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3328 3329 /* decode RLE */ 3330 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3331 TARGET_PAGE_SIZE) == -1) { 3332 error_report("Failed to load XBZRLE page - decode error!"); 3333 return -1; 3334 } 3335 3336 return 0; 3337 } 3338 3339 /** 3340 * ram_block_from_stream: read a RAMBlock id from the migration stream 3341 * 3342 * Must be called from within a rcu critical section. 3343 * 3344 * Returns a pointer from within the RCU-protected ram_list. 3345 * 3346 * @mis: the migration incoming state pointer 3347 * @f: QEMUFile where to read the data from 3348 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3349 * @channel: the channel we're using 3350 */ 3351 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3352 QEMUFile *f, int flags, 3353 int channel) 3354 { 3355 RAMBlock *block = mis->last_recv_block[channel]; 3356 char id[256]; 3357 uint8_t len; 3358 3359 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3360 if (!block) { 3361 error_report("Ack, bad migration stream!"); 3362 return NULL; 3363 } 3364 return block; 3365 } 3366 3367 len = qemu_get_byte(f); 3368 qemu_get_buffer(f, (uint8_t *)id, len); 3369 id[len] = 0; 3370 3371 block = qemu_ram_block_by_name(id); 3372 if (!block) { 3373 error_report("Can't find block %s", id); 3374 return NULL; 3375 } 3376 3377 if (ramblock_is_ignored(block)) { 3378 error_report("block %s should not be migrated !", id); 3379 return NULL; 3380 } 3381 3382 mis->last_recv_block[channel] = block; 3383 3384 return block; 3385 } 3386 3387 static inline void *host_from_ram_block_offset(RAMBlock *block, 3388 ram_addr_t offset) 3389 { 3390 if (!offset_in_ramblock(block, offset)) { 3391 return NULL; 3392 } 3393 3394 return block->host + offset; 3395 } 3396 3397 static void *host_page_from_ram_block_offset(RAMBlock *block, 3398 ram_addr_t offset) 3399 { 3400 /* Note: Explicitly no check against offset_in_ramblock(). 
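 * The ram_load_postcopy() callers validate addr against
 * block->postcopy_length before getting here.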
*/ 3401 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3402 block->page_size); 3403 } 3404 3405 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3406 ram_addr_t offset) 3407 { 3408 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3409 } 3410 3411 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3412 { 3413 qemu_mutex_lock(&ram_state->bitmap_mutex); 3414 for (int i = 0; i < pages; i++) { 3415 ram_addr_t offset = normal[i]; 3416 ram_state->migration_dirty_pages += !test_and_set_bit( 3417 offset >> TARGET_PAGE_BITS, 3418 block->bmap); 3419 } 3420 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3421 } 3422 3423 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3424 ram_addr_t offset, bool record_bitmap) 3425 { 3426 if (!offset_in_ramblock(block, offset)) { 3427 return NULL; 3428 } 3429 if (!block->colo_cache) { 3430 error_report("%s: colo_cache is NULL in block :%s", 3431 __func__, block->idstr); 3432 return NULL; 3433 } 3434 3435 /* 3436 * During colo checkpoint, we need bitmap of these migrated pages. 3437 * It help us to decide which pages in ram cache should be flushed 3438 * into VM's RAM later. 3439 */ 3440 if (record_bitmap) { 3441 colo_record_bitmap(block, &offset, 1); 3442 } 3443 return block->colo_cache + offset; 3444 } 3445 3446 /** 3447 * ram_handle_compressed: handle the zero page case 3448 * 3449 * If a page (or a whole RDMA chunk) has been 3450 * determined to be zero, then zap it. 3451 * 3452 * @host: host address for the zero page 3453 * @ch: what the page is filled from. We only support zero 3454 * @size: size of the zero page 3455 */ 3456 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3457 { 3458 if (ch != 0 || !buffer_is_zero(host, size)) { 3459 memset(host, ch, size); 3460 } 3461 } 3462 3463 static void colo_init_ram_state(void) 3464 { 3465 ram_state_init(&ram_state); 3466 } 3467 3468 /* 3469 * colo cache: this is for secondary VM, we cache the whole 3470 * memory of the secondary VM, it is need to hold the global lock 3471 * to call this helper. 3472 */ 3473 int colo_init_ram_cache(void) 3474 { 3475 RAMBlock *block; 3476 3477 WITH_RCU_READ_LOCK_GUARD() { 3478 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3479 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3480 NULL, false, false); 3481 if (!block->colo_cache) { 3482 error_report("%s: Can't alloc memory for COLO cache of block %s," 3483 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3484 block->used_length); 3485 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3486 if (block->colo_cache) { 3487 qemu_anon_ram_free(block->colo_cache, block->used_length); 3488 block->colo_cache = NULL; 3489 } 3490 } 3491 return -errno; 3492 } 3493 if (!machine_dump_guest_core(current_machine)) { 3494 qemu_madvise(block->colo_cache, block->used_length, 3495 QEMU_MADV_DONTDUMP); 3496 } 3497 } 3498 } 3499 3500 /* 3501 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3502 * with to decide which page in cache should be flushed into SVM's RAM. Here 3503 * we use the same name 'ram_bitmap' as for migration. 
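 * (colo_flush_ram_cache() is the consumer that copies those pages from
 * colo_cache back into the SVM's RAM)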
3504 */ 3505 if (ram_bytes_total()) { 3506 RAMBlock *block; 3507 3508 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3509 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3510 block->bmap = bitmap_new(pages); 3511 } 3512 } 3513 3514 colo_init_ram_state(); 3515 return 0; 3516 } 3517 3518 /* TODO: duplicated with ram_init_bitmaps */ 3519 void colo_incoming_start_dirty_log(void) 3520 { 3521 RAMBlock *block = NULL; 3522 /* For memory_global_dirty_log_start below. */ 3523 qemu_mutex_lock_iothread(); 3524 qemu_mutex_lock_ramlist(); 3525 3526 memory_global_dirty_log_sync(false); 3527 WITH_RCU_READ_LOCK_GUARD() { 3528 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3529 ramblock_sync_dirty_bitmap(ram_state, block); 3530 /* Discard this dirty bitmap record */ 3531 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3532 } 3533 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3534 } 3535 ram_state->migration_dirty_pages = 0; 3536 qemu_mutex_unlock_ramlist(); 3537 qemu_mutex_unlock_iothread(); 3538 } 3539 3540 /* It is need to hold the global lock to call this helper */ 3541 void colo_release_ram_cache(void) 3542 { 3543 RAMBlock *block; 3544 3545 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3546 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3547 g_free(block->bmap); 3548 block->bmap = NULL; 3549 } 3550 3551 WITH_RCU_READ_LOCK_GUARD() { 3552 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3553 if (block->colo_cache) { 3554 qemu_anon_ram_free(block->colo_cache, block->used_length); 3555 block->colo_cache = NULL; 3556 } 3557 } 3558 } 3559 ram_state_cleanup(&ram_state); 3560 } 3561 3562 /** 3563 * ram_load_setup: Setup RAM for migration incoming side 3564 * 3565 * Returns zero to indicate success and negative for error 3566 * 3567 * @f: QEMUFile where to receive the data 3568 * @opaque: RAMState pointer 3569 */ 3570 static int ram_load_setup(QEMUFile *f, void *opaque) 3571 { 3572 xbzrle_load_setup(); 3573 ramblock_recv_map_init(); 3574 3575 return 0; 3576 } 3577 3578 static int ram_load_cleanup(void *opaque) 3579 { 3580 RAMBlock *rb; 3581 3582 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3583 qemu_ram_block_writeback(rb); 3584 } 3585 3586 xbzrle_load_cleanup(); 3587 3588 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3589 g_free(rb->receivedmap); 3590 rb->receivedmap = NULL; 3591 } 3592 3593 return 0; 3594 } 3595 3596 /** 3597 * ram_postcopy_incoming_init: allocate postcopy data structures 3598 * 3599 * Returns 0 for success and negative if there was one error 3600 * 3601 * @mis: current migration incoming state 3602 * 3603 * Allocate data structures etc needed by incoming migration with 3604 * postcopy-ram. postcopy-ram's similarly names 3605 * postcopy_ram_incoming_init does the work. 3606 */ 3607 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3608 { 3609 return postcopy_ram_incoming_init(mis); 3610 } 3611 3612 /** 3613 * ram_load_postcopy: load a page in postcopy case 3614 * 3615 * Returns 0 for success or -errno in case of error 3616 * 3617 * Called in postcopy mode by ram_load(). 3618 * rcu_read_lock is taken prior to this being called. 
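 *
 * Incoming target pages are staged in the per-channel PostcopyTmpPage
 * buffer and only placed into guest memory once every target page of the
 * host page has arrived, keeping the placement atomic.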
3619 * 3620 * @f: QEMUFile where to send the data 3621 * @channel: the channel to use for loading 3622 */ 3623 int ram_load_postcopy(QEMUFile *f, int channel) 3624 { 3625 int flags = 0, ret = 0; 3626 bool place_needed = false; 3627 bool matches_target_page_size = false; 3628 MigrationIncomingState *mis = migration_incoming_get_current(); 3629 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3630 3631 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3632 ram_addr_t addr; 3633 void *page_buffer = NULL; 3634 void *place_source = NULL; 3635 RAMBlock *block = NULL; 3636 uint8_t ch; 3637 int len; 3638 3639 addr = qemu_get_be64(f); 3640 3641 /* 3642 * If qemu file error, we should stop here, and then "addr" 3643 * may be invalid 3644 */ 3645 ret = qemu_file_get_error(f); 3646 if (ret) { 3647 break; 3648 } 3649 3650 flags = addr & ~TARGET_PAGE_MASK; 3651 addr &= TARGET_PAGE_MASK; 3652 3653 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3654 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3655 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3656 block = ram_block_from_stream(mis, f, flags, channel); 3657 if (!block) { 3658 ret = -EINVAL; 3659 break; 3660 } 3661 3662 /* 3663 * Relying on used_length is racy and can result in false positives. 3664 * We might place pages beyond used_length in case RAM was shrunk 3665 * while in postcopy, which is fine - trying to place via 3666 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3667 */ 3668 if (!block->host || addr >= block->postcopy_length) { 3669 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3670 ret = -EINVAL; 3671 break; 3672 } 3673 tmp_page->target_pages++; 3674 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3675 /* 3676 * Postcopy requires that we place whole host pages atomically; 3677 * these may be huge pages for RAMBlocks that are backed by 3678 * hugetlbfs. 3679 * To make it atomic, the data is read into a temporary page 3680 * that's moved into place later. 3681 * The migration protocol uses, possibly smaller, target-pages 3682 * however the source ensures it always sends all the components 3683 * of a host page in one chunk. 3684 */ 3685 page_buffer = tmp_page->tmp_huge_page + 3686 host_page_offset_from_ram_block_offset(block, addr); 3687 /* If all TP are zero then we can optimise the place */ 3688 if (tmp_page->target_pages == 1) { 3689 tmp_page->host_addr = 3690 host_page_from_ram_block_offset(block, addr); 3691 } else if (tmp_page->host_addr != 3692 host_page_from_ram_block_offset(block, addr)) { 3693 /* not the 1st TP within the HP */ 3694 error_report("Non-same host page detected on channel %d: " 3695 "Target host page %p, received host page %p " 3696 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3697 channel, tmp_page->host_addr, 3698 host_page_from_ram_block_offset(block, addr), 3699 block->idstr, addr, tmp_page->target_pages); 3700 ret = -EINVAL; 3701 break; 3702 } 3703 3704 /* 3705 * If it's the last part of a host page then we place the host 3706 * page 3707 */ 3708 if (tmp_page->target_pages == 3709 (block->page_size / TARGET_PAGE_SIZE)) { 3710 place_needed = true; 3711 } 3712 place_source = tmp_page->tmp_huge_page; 3713 } 3714 3715 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3716 case RAM_SAVE_FLAG_ZERO: 3717 ch = qemu_get_byte(f); 3718 /* 3719 * Can skip to set page_buffer when 3720 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
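 * For hugepage backed blocks the zero bytes are still written into the
 * temporary buffer, since a partially-zero host page is later placed
 * from that buffer in one go.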
3721 */ 3722 if (ch || !matches_target_page_size) { 3723 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3724 } 3725 if (ch) { 3726 tmp_page->all_zero = false; 3727 } 3728 break; 3729 3730 case RAM_SAVE_FLAG_PAGE: 3731 tmp_page->all_zero = false; 3732 if (!matches_target_page_size) { 3733 /* For huge pages, we always use temporary buffer */ 3734 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3735 } else { 3736 /* 3737 * For small pages that matches target page size, we 3738 * avoid the qemu_file copy. Instead we directly use 3739 * the buffer of QEMUFile to place the page. Note: we 3740 * cannot do any QEMUFile operation before using that 3741 * buffer to make sure the buffer is valid when 3742 * placing the page. 3743 */ 3744 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3745 TARGET_PAGE_SIZE); 3746 } 3747 break; 3748 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3749 tmp_page->all_zero = false; 3750 len = qemu_get_be32(f); 3751 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3752 error_report("Invalid compressed data length: %d", len); 3753 ret = -EINVAL; 3754 break; 3755 } 3756 decompress_data_with_multi_threads(f, page_buffer, len); 3757 break; 3758 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 3759 multifd_recv_sync_main(); 3760 break; 3761 case RAM_SAVE_FLAG_EOS: 3762 /* normal exit */ 3763 if (migrate_multifd_flush_after_each_section()) { 3764 multifd_recv_sync_main(); 3765 } 3766 break; 3767 default: 3768 error_report("Unknown combination of migration flags: 0x%x" 3769 " (postcopy mode)", flags); 3770 ret = -EINVAL; 3771 break; 3772 } 3773 3774 /* Got the whole host page, wait for decompress before placing. */ 3775 if (place_needed) { 3776 ret |= wait_for_decompress_done(); 3777 } 3778 3779 /* Detect for any possible file errors */ 3780 if (!ret && qemu_file_get_error(f)) { 3781 ret = qemu_file_get_error(f); 3782 } 3783 3784 if (!ret && place_needed) { 3785 if (tmp_page->all_zero) { 3786 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3787 } else { 3788 ret = postcopy_place_page(mis, tmp_page->host_addr, 3789 place_source, block); 3790 } 3791 place_needed = false; 3792 postcopy_temp_page_reset(tmp_page); 3793 } 3794 } 3795 3796 return ret; 3797 } 3798 3799 static bool postcopy_is_running(void) 3800 { 3801 PostcopyState ps = postcopy_state_get(); 3802 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3803 } 3804 3805 /* 3806 * Flush content of RAM cache into SVM's memory. 3807 * Only flush the pages that be dirtied by PVM or SVM or both. 
static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that have been dirtied by PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync(false);
    qemu_mutex_lock(&ram_state->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

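        /*
         * Sketch of the walk below (inferred from how the results are used
         * here; colo_bitmap_find_dirty() is defined elsewhere): "num" is the
         * length of a run of contiguous dirty pages starting at the returned
         * "offset".  If that offset falls outside the block we move on to
         * the next RAMBlock; otherwise we clear the dirty bits of the run
         * and copy num * TARGET_PAGE_SIZE bytes from colo_cache into the
         * SVM's memory.
         */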
        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    qemu_mutex_unlock(&ram_state->bitmap_mutex);
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to load the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE arrives earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_compress()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so do it only every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly, but put it into colo_cache first.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparing
             * stage, which required stopping the VM and was time-consuming.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled.  Although this
             * slows down the migration somewhat, it clearly reduces the
             * downtime of backing up all of the SVM's memory in the COLO
             * preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In the migration stage but before the COLO stage,
                     * put all pages into both the cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
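        /*
         * Rough sketch of the per-block record consumed by the MEM_SIZE
         * case below, as parsed here (the sender side is not shown in this
         * file): a one-byte idstr length, the idstr bytes, a be64 block
         * length, then optionally a be64 remote page size (when postcopy
         * was advised and the block's page size differs from the host page
         * size) and a be64 GPA (when migrate_ignore_shared() is set).
         */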
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check that hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd_flush_after_each_section()) {
                multifd_recv_sync_main();
            }
            break;
        case RAM_SAVE_FLAG_HOOK:
            ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x", flags);
            ret = -EINVAL;
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclamation in this code starts to become frequent,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

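/*
 * Rough sketch of the recv-bitmap packet parsed by ram_dirty_bitmap_reload()
 * below, as consumed here (the sender is ramblock_recv_bitmap_send(), not
 * shown in this file): a be64 size in bytes, which must equal
 * DIV_ROUND_UP(nbits, 8) rounded up to a multiple of 8, followed by that
 * many bytes of little-endian bitmap data, followed by a be64 end mark that
 * must be RAMBLOCK_RECV_BITMAP_ENDING.
 */
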
/*
 * Read the received bitmap and invert it to form the initial dirty bitmap.
 * This is only used when a paused postcopy migration wants to resume from
 * a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused).
     * The dirty bitmap won't change.  We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

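    /*
     * Worked example (values for illustration only): if the destination
     * reported the first and third pages of a four-page block as received
     * (little-endian bits 0b0101), the complement 0b1010 leaves exactly the
     * not-yet-received pages marked dirty, so they will be re-sent after
     * resume.
     */
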
    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for the current ramblock.
     * If this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised.  Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}

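/*
 * Note on the registration above: the version_id of 4 passed to
 * register_savevm_live() matches the check in ram_load(), which rejects any
 * other stream version, and &ram_state is the opaque pointer that
 * ram_resume_prepare() dereferences.  The RAM block resize notifier cancels
 * an in-progress migration on resize and, while postcopy is in the ADVISE
 * state, keeps rb->postcopy_length up to date.
 */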