/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it worked
 * for pages that were filled with the same char.  We switched it to only
 * search for the zero value, and renamed it to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
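/*
 * These flags travel in the low bits of the 8 byte page offset written by
 * save_page_header(), e.g. a continued normal page goes out as
 * (offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE).
 */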
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
/* We can't use any flag that is bigger than 0x200 */

int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
                                 uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        /* We must check that AVX is not just available, but usable. */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
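/*
 * Illustrative usage (see ram_save_page() below): any access to XBZRLE.cache
 * is wrapped in these helpers, e.g.
 *
 *     XBZRLE_cache_lock();
 *     ... cache_is_cached() / cache_insert() / get_cached_data() ...
 *     XBZRLE_cache_unlock();
 *
 * so a concurrent xbzrle_cache_resize() cannot swap the cache out from under
 * a sender.
 */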
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

/*
 * NOTE: not all stats in ram_counters are used in reality.  See comments
 * for struct MigrationAtomicStats.  The ultimate result of ram migration
 * counters will be a merged version with both ram_counters and the atomic
 * fields in ram_atomic_counters.
 */
MigrationStats ram_counters;
MigrationAtomicStats ram_atomic_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    stat64_add(&ram_atomic_counters.transferred, bytes);
}

void dirty_sync_missed_zero_copy(void)
{
    ram_counters.dirty_sync_missed_zero_copy++;
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
        (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);
    QEMUFile *f = pss->pss_channel;

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by the guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
855 */ 856 *current_data = prev_cached_page; 857 } 858 859 if (encoded_len == 0) { 860 trace_save_xbzrle_page_skipping(); 861 return 0; 862 } else if (encoded_len == -1) { 863 trace_save_xbzrle_page_overflow(); 864 xbzrle_counters.overflow++; 865 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 866 return -1; 867 } 868 869 /* Send XBZRLE based compressed page */ 870 bytes_xbzrle = save_page_header(pss, block, 871 offset | RAM_SAVE_FLAG_XBZRLE); 872 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 873 qemu_put_be16(file, encoded_len); 874 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 875 bytes_xbzrle += encoded_len + 1 + 2; 876 /* 877 * Like compressed_size (please see update_compress_thread_counts), 878 * the xbzrle encoded bytes don't count the 8 byte header with 879 * RAM_SAVE_FLAG_CONTINUE. 880 */ 881 xbzrle_counters.bytes += bytes_xbzrle - 8; 882 ram_transferred_add(bytes_xbzrle); 883 884 return 1; 885 } 886 887 /** 888 * pss_find_next_dirty: find the next dirty page of current ramblock 889 * 890 * This function updates pss->page to point to the next dirty page index 891 * within the ramblock to migrate, or the end of ramblock when nothing 892 * found. Note that when pss->host_page_sending==true it means we're 893 * during sending a host page, so we won't look for dirty page that is 894 * outside the host page boundary. 895 * 896 * @pss: the current page search status 897 */ 898 static void pss_find_next_dirty(PageSearchStatus *pss) 899 { 900 RAMBlock *rb = pss->block; 901 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 902 unsigned long *bitmap = rb->bmap; 903 904 if (ramblock_is_ignored(rb)) { 905 /* Points directly to the end, so we know no dirty page */ 906 pss->page = size; 907 return; 908 } 909 910 /* 911 * If during sending a host page, only look for dirty pages within the 912 * current host page being send. 913 */ 914 if (pss->host_page_sending) { 915 assert(pss->host_page_end); 916 size = MIN(size, pss->host_page_end); 917 } 918 919 pss->page = find_next_bit(bitmap, size, pss->page); 920 } 921 922 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 923 unsigned long page) 924 { 925 uint8_t shift; 926 hwaddr size, start; 927 928 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 929 return; 930 } 931 932 shift = rb->clear_bmap_shift; 933 /* 934 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 935 * can make things easier sometimes since then start address 936 * of the small chunk will always be 64 pages aligned so the 937 * bitmap will always be aligned to unsigned long. We should 938 * even be able to remove this restriction but I'm simply 939 * keeping it. 940 */ 941 assert(shift >= 6); 942 943 size = 1ULL << (TARGET_PAGE_BITS + shift); 944 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 945 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 946 memory_region_clear_dirty_bitmap(rb->mr, start, size); 947 } 948 949 static void 950 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 951 unsigned long start, 952 unsigned long npages) 953 { 954 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 955 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 956 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 957 958 /* 959 * Clear pages from start to start + npages - 1, so the end boundary is 960 * exclusive. 
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                              dirty_bitmap_clear_section,
                                              &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}
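/*
 * Example (illustrative): a VM backed by ordinary 4 KiB pages plus one
 * 2 MiB hugepage-backed RAMBlock makes ram_pagesize_summary() below
 * return 0x1000 | 0x200000 == 0x201000.
 */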
/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&ram_atomic_counters.normal) +
        stat64_get(&ram_atomic_counters.duplicate) +
        compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;
    uint64_t bytes_xfer_period =
        stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */
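        /*
         * Illustrative numbers: with throttle_trigger_threshold == 50 and
         * 1 GiB transferred during the period, bytes_dirty_threshold is
         * 512 MiB; dirtying more than that within the same period twice
         * triggers mig_throttle_guest_down().
         */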
        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization for migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}
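/*
 * On the wire a zero page is thus just the 8 byte header written by
 * save_page_header() with RAM_SAVE_FLAG_ZERO set (plus the block idstr when
 * the block changes), followed by a single 0 byte.
 */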
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, block, offset);

    if (len) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_atomic_counters.normal, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_atomic_counters.normal, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_atomic_counters.normal, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, block, offset)) {
        return true;
    }

    save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch errors during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @rb: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically. We don't have to track
We don't have to track 1863 * modifications via userfaultfd WP reliably, because these pages will 1864 * not be part of the migration stream either way -- see 1865 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1866 * 1867 * Note: The result is only stable while migrating (precopy/postcopy). 1868 */ 1869 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1870 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1871 MemoryRegionSection section = { 1872 .mr = rb->mr, 1873 .offset_within_region = 0, 1874 .size = rb->mr->size, 1875 }; 1876 1877 ram_discard_manager_replay_populated(rdm, &section, 1878 populate_read_section, NULL); 1879 } else { 1880 populate_read_range(rb, 0, rb->used_length); 1881 } 1882 } 1883 1884 /* 1885 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1886 */ 1887 void ram_write_tracking_prepare(void) 1888 { 1889 RAMBlock *block; 1890 1891 RCU_READ_LOCK_GUARD(); 1892 1893 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1894 /* Nothing to do with read-only and MMIO-writable regions */ 1895 if (block->mr->readonly || block->mr->rom_device) { 1896 continue; 1897 } 1898 1899 /* 1900 * Populate pages of the RAM block before enabling userfault_fd 1901 * write protection. 1902 * 1903 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1904 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1905 * pages with pte_none() entries in page table. 1906 */ 1907 ram_block_populate_read(block); 1908 } 1909 } 1910 1911 static inline int uffd_protect_section(MemoryRegionSection *section, 1912 void *opaque) 1913 { 1914 const hwaddr size = int128_get64(section->size); 1915 const hwaddr offset = section->offset_within_region; 1916 RAMBlock *rb = section->mr->ram_block; 1917 int uffd_fd = (uintptr_t)opaque; 1918 1919 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1920 false); 1921 } 1922 1923 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1924 { 1925 assert(rb->flags & RAM_UF_WRITEPROTECT); 1926 1927 /* See ram_block_populate_read() */ 1928 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1929 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1930 MemoryRegionSection section = { 1931 .mr = rb->mr, 1932 .offset_within_region = 0, 1933 .size = rb->mr->size, 1934 }; 1935 1936 return ram_discard_manager_replay_populated(rdm, &section, 1937 uffd_protect_section, 1938 (void *)(uintptr_t)uffd_fd); 1939 } 1940 return uffd_change_protection(uffd_fd, rb->host, 1941 rb->used_length, true, false); 1942 } 1943 1944 /* 1945 * ram_write_tracking_start: start UFFD-WP memory tracking 1946 * 1947 * Returns 0 for success or negative value in case of error 1948 */ 1949 int ram_write_tracking_start(void) 1950 { 1951 int uffd_fd; 1952 RAMState *rs = ram_state; 1953 RAMBlock *block; 1954 1955 /* Open UFFD file descriptor */ 1956 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1957 if (uffd_fd < 0) { 1958 return uffd_fd; 1959 } 1960 rs->uffdio_fd = uffd_fd; 1961 1962 RCU_READ_LOCK_GUARD(); 1963 1964 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1965 /* Nothing to do with read-only and MMIO-writable regions */ 1966 if (block->mr->readonly || block->mr->rom_device) { 1967 continue; 1968 } 1969 1970 /* Register block memory with UFFD to track writes */ 1971 if (uffd_register_memory(rs->uffdio_fd, block->host, 1972 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1973 goto fail; 1974 } 1975 block->flags |= RAM_UF_WRITEPROTECT; 1976 memory_region_ref(block->mr); 1977 1978 /* Apply
UFFD write protection to the block memory range */ 1979 if (ram_block_uffd_protect(block, uffd_fd)) { 1980 goto fail; 1981 } 1982 1983 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1984 block->host, block->max_length); 1985 } 1986 1987 return 0; 1988 1989 fail: 1990 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1991 1992 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1993 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1994 continue; 1995 } 1996 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1997 /* Cleanup flags and remove reference */ 1998 block->flags &= ~RAM_UF_WRITEPROTECT; 1999 memory_region_unref(block->mr); 2000 } 2001 2002 uffd_close_fd(uffd_fd); 2003 rs->uffdio_fd = -1; 2004 return -1; 2005 } 2006 2007 /** 2008 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 2009 */ 2010 void ram_write_tracking_stop(void) 2011 { 2012 RAMState *rs = ram_state; 2013 RAMBlock *block; 2014 2015 RCU_READ_LOCK_GUARD(); 2016 2017 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2018 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2019 continue; 2020 } 2021 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2022 2023 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2024 block->host, block->max_length); 2025 2026 /* Cleanup flags and remove reference */ 2027 block->flags &= ~RAM_UF_WRITEPROTECT; 2028 memory_region_unref(block->mr); 2029 } 2030 2031 /* Finally close UFFD file descriptor */ 2032 uffd_close_fd(rs->uffdio_fd); 2033 rs->uffdio_fd = -1; 2034 } 2035 2036 #else 2037 /* No target OS support, stubs just fail or ignore */ 2038 2039 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2040 { 2041 (void) rs; 2042 (void) offset; 2043 2044 return NULL; 2045 } 2046 2047 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2048 unsigned long start_page) 2049 { 2050 (void) rs; 2051 (void) pss; 2052 (void) start_page; 2053 2054 return 0; 2055 } 2056 2057 bool ram_write_tracking_available(void) 2058 { 2059 return false; 2060 } 2061 2062 bool ram_write_tracking_compatible(void) 2063 { 2064 assert(0); 2065 return false; 2066 } 2067 2068 int ram_write_tracking_start(void) 2069 { 2070 assert(0); 2071 return -1; 2072 } 2073 2074 void ram_write_tracking_stop(void) 2075 { 2076 assert(0); 2077 } 2078 #endif /* defined(__linux__) */ 2079 2080 /** 2081 * get_queued_page: unqueue a page from the postcopy requests 2082 * 2083 * Skips pages that are already sent (!dirty) 2084 * 2085 * Returns true if a queued page is found 2086 * 2087 * @rs: current RAM state 2088 * @pss: data about the state of the current dirty page scan 2089 */ 2090 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2091 { 2092 RAMBlock *block; 2093 ram_addr_t offset; 2094 bool dirty; 2095 2096 do { 2097 block = unqueue_page(rs, &offset); 2098 /* 2099 * We're sending this page, and since it's postcopy nothing else 2100 * will dirty it, and we must make sure it doesn't get sent again 2101 * even if this queue request was received after the background 2102 * search already sent it. 
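* That is what the dirty-bit check below is for: a queued page that is no longer marked dirty in block->bmap has already been sent, so the request is simply dropped.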
2103 */ 2104 if (block) { 2105 unsigned long page; 2106 2107 page = offset >> TARGET_PAGE_BITS; 2108 dirty = test_bit(page, block->bmap); 2109 if (!dirty) { 2110 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2111 page); 2112 } else { 2113 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2114 } 2115 } 2116 2117 } while (block && !dirty); 2118 2119 if (!block) { 2120 /* 2121 * Poll write faults too if background snapshot is enabled; that's 2122 * when we have vcpus got blocked by the write protected pages. 2123 */ 2124 block = poll_fault_page(rs, &offset); 2125 } 2126 2127 if (block) { 2128 /* 2129 * We want the background search to continue from the queued page 2130 * since the guest is likely to want other pages near to the page 2131 * it just requested. 2132 */ 2133 pss->block = block; 2134 pss->page = offset >> TARGET_PAGE_BITS; 2135 2136 /* 2137 * This unqueued page would break the "one round" check, even is 2138 * really rare. 2139 */ 2140 pss->complete_round = false; 2141 } 2142 2143 return !!block; 2144 } 2145 2146 /** 2147 * migration_page_queue_free: drop any remaining pages in the ram 2148 * request queue 2149 * 2150 * It should be empty at the end anyway, but in error cases there may 2151 * be some left. in case that there is any page left, we drop it. 2152 * 2153 */ 2154 static void migration_page_queue_free(RAMState *rs) 2155 { 2156 struct RAMSrcPageRequest *mspr, *next_mspr; 2157 /* This queue generally should be empty - but in the case of a failed 2158 * migration might have some droppings in. 2159 */ 2160 RCU_READ_LOCK_GUARD(); 2161 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2162 memory_region_unref(mspr->rb->mr); 2163 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2164 g_free(mspr); 2165 } 2166 } 2167 2168 /** 2169 * ram_save_queue_pages: queue the page for transmission 2170 * 2171 * A request from postcopy destination for example. 2172 * 2173 * Returns zero on success or negative on error 2174 * 2175 * @rbname: Name of the RAMBLock of the request. NULL means the 2176 * same that last one. 2177 * @start: starting address from the start of the RAMBlock 2178 * @len: length (in bytes) to send 2179 */ 2180 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2181 { 2182 RAMBlock *ramblock; 2183 RAMState *rs = ram_state; 2184 2185 ram_counters.postcopy_requests++; 2186 RCU_READ_LOCK_GUARD(); 2187 2188 if (!rbname) { 2189 /* Reuse last RAMBlock */ 2190 ramblock = rs->last_req_rb; 2191 2192 if (!ramblock) { 2193 /* 2194 * Shouldn't happen, we can't reuse the last RAMBlock if 2195 * it's the 1st request. 2196 */ 2197 error_report("ram_save_queue_pages no previous block"); 2198 return -1; 2199 } 2200 } else { 2201 ramblock = qemu_ram_block_by_name(rbname); 2202 2203 if (!ramblock) { 2204 /* We shouldn't be asked for a non-existent RAMBlock */ 2205 error_report("ram_save_queue_pages no block '%s'", rbname); 2206 return -1; 2207 } 2208 rs->last_req_rb = ramblock; 2209 } 2210 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2211 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2212 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2213 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2214 __func__, start, len, ramblock->used_length); 2215 return -1; 2216 } 2217 2218 /* 2219 * When with postcopy preempt, we send back the page directly in the 2220 * rp-return thread. 
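* The page is pushed through the dedicated preempt channel (pss_channel is set to postcopy_qemufile_src below) while holding bitmap_mutex, instead of being queued for the migration thread.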
2221 */ 2222 if (postcopy_preempt_active()) { 2223 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2224 size_t page_size = qemu_ram_pagesize(ramblock); 2225 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2226 int ret = 0; 2227 2228 qemu_mutex_lock(&rs->bitmap_mutex); 2229 2230 pss_init(pss, ramblock, page_start); 2231 /* 2232 * Always use the preempt channel, and make sure it's there. It's 2233 * safe to access without lock, because when rp-thread is running 2234 * we should be the only one who operates on the qemufile 2235 */ 2236 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2237 assert(pss->pss_channel); 2238 2239 /* 2240 * It must be either one or multiple of host page size. Just 2241 * assert; if something wrong we're mostly split brain anyway. 2242 */ 2243 assert(len % page_size == 0); 2244 while (len) { 2245 if (ram_save_host_page_urgent(pss)) { 2246 error_report("%s: ram_save_host_page_urgent() failed: " 2247 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2248 __func__, ramblock->idstr, start); 2249 ret = -1; 2250 break; 2251 } 2252 /* 2253 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2254 * will automatically be moved and point to the next host page 2255 * we're going to send, so no need to update here. 2256 * 2257 * Normally QEMU never sends >1 host page in requests, so 2258 * logically we don't even need that as the loop should only 2259 * run once, but just to be consistent. 2260 */ 2261 len -= page_size; 2262 }; 2263 qemu_mutex_unlock(&rs->bitmap_mutex); 2264 2265 return ret; 2266 } 2267 2268 struct RAMSrcPageRequest *new_entry = 2269 g_new0(struct RAMSrcPageRequest, 1); 2270 new_entry->rb = ramblock; 2271 new_entry->offset = start; 2272 new_entry->len = len; 2273 2274 memory_region_ref(ramblock->mr); 2275 qemu_mutex_lock(&rs->src_page_req_mutex); 2276 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2277 migration_make_urgent_request(); 2278 qemu_mutex_unlock(&rs->src_page_req_mutex); 2279 2280 return 0; 2281 } 2282 2283 static bool save_page_use_compression(RAMState *rs) 2284 { 2285 if (!migrate_use_compression()) { 2286 return false; 2287 } 2288 2289 /* 2290 * If xbzrle is enabled (e.g., after first round of migration), stop 2291 * using the data compression. In theory, xbzrle can do better than 2292 * compression. 2293 */ 2294 if (rs->xbzrle_enabled) { 2295 return false; 2296 } 2297 2298 return true; 2299 } 2300 2301 /* 2302 * try to compress the page before posting it out, return true if the page 2303 * has been properly handled by compression, otherwise needs other 2304 * paths to handle it 2305 */ 2306 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2307 RAMBlock *block, ram_addr_t offset) 2308 { 2309 if (!save_page_use_compression(rs)) { 2310 return false; 2311 } 2312 2313 /* 2314 * When starting the process of a new block, the first page of 2315 * the block should be sent out before other pages in the same 2316 * block, and all the pages in last block should have been sent 2317 * out, keeping this order is important, because the 'cont' flag 2318 * is used to avoid resending the block name. 2319 * 2320 * We post the fist page as normal page as compression will take 2321 * much CPU resource. 
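* Returning false makes the caller fall back to the regular path, so the first page of a new block goes out uncompressed (and, since the 'cont' flag cannot be used yet, it carries the block name).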
2322 */ 2323 if (block != pss->last_sent_block) { 2324 flush_compressed_data(rs); 2325 return false; 2326 } 2327 2328 if (compress_page_with_multi_thread(block, offset) > 0) { 2329 return true; 2330 } 2331 2332 compression_counters.busy++; 2333 return false; 2334 } 2335 2336 /** 2337 * ram_save_target_page_legacy: save one target page 2338 * 2339 * Returns the number of pages written 2340 * 2341 * @rs: current RAM state 2342 * @pss: data about the page we want to send 2343 */ 2344 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2345 { 2346 RAMBlock *block = pss->block; 2347 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2348 int res; 2349 2350 if (control_save_page(pss, block, offset, &res)) { 2351 return res; 2352 } 2353 2354 if (save_compress_page(rs, pss, block, offset)) { 2355 return 1; 2356 } 2357 2358 res = save_zero_page(pss, block, offset); 2359 if (res > 0) { 2360 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2361 * page would be stale 2362 */ 2363 if (rs->xbzrle_enabled) { 2364 XBZRLE_cache_lock(); 2365 xbzrle_cache_zero_page(rs, block->offset + offset); 2366 XBZRLE_cache_unlock(); 2367 } 2368 return res; 2369 } 2370 2371 /* 2372 * Do not use multifd in postcopy as one whole host page should be 2373 * placed. Meanwhile postcopy requires atomic update of pages, so even 2374 * if host page size == guest page size the dest guest during run may 2375 * still see partially copied pages which is data corruption. 2376 */ 2377 if (migrate_use_multifd() && !migration_in_postcopy()) { 2378 return ram_save_multifd_page(pss->pss_channel, block, offset); 2379 } 2380 2381 return ram_save_page(rs, pss); 2382 } 2383 2384 /* Should be called before sending a host page */ 2385 static void pss_host_page_prepare(PageSearchStatus *pss) 2386 { 2387 /* How many guest pages are there in one host page? */ 2388 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2389 2390 pss->host_page_sending = true; 2391 if (guest_pfns <= 1) { 2392 /* 2393 * This covers both when guest psize == host psize, or when guest 2394 * has larger psize than the host (guest_pfns==0). 2395 * 2396 * For the latter, we always send one whole guest page per 2397 * iteration of the host page (example: an Alpha VM on x86 host 2398 * will have guest psize 8K while host psize 4K). 2399 */ 2400 pss->host_page_start = pss->page; 2401 pss->host_page_end = pss->page + 1; 2402 } else { 2403 /* 2404 * The host page spans over multiple guest pages, we send them 2405 * within the same host page iteration. 2406 */ 2407 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2408 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2409 } 2410 } 2411 2412 /* 2413 * Whether the page pointed by PSS is within the host page being sent. 2414 * Must be called after a previous pss_host_page_prepare(). 2415 */ 2416 static bool pss_within_range(PageSearchStatus *pss) 2417 { 2418 ram_addr_t ram_addr; 2419 2420 assert(pss->host_page_sending); 2421 2422 /* Over host-page boundary? */ 2423 if (pss->page >= pss->host_page_end) { 2424 return false; 2425 } 2426 2427 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2428 2429 return offset_in_ramblock(pss->block, ram_addr); 2430 } 2431 2432 static void pss_host_page_finish(PageSearchStatus *pss) 2433 { 2434 pss->host_page_sending = false; 2435 /* This is not needed, but just to reset it */ 2436 pss->host_page_start = pss->host_page_end = 0; 2437 } 2438 2439 /* 2440 * Send an urgent host page specified by `pss'. 
Need to be called with 2441 * bitmap_mutex held. 2442 * 2443 * Returns 0 if save host page succeeded, false otherwise. 2444 */ 2445 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2446 { 2447 bool page_dirty, sent = false; 2448 RAMState *rs = ram_state; 2449 int ret = 0; 2450 2451 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2452 pss_host_page_prepare(pss); 2453 2454 /* 2455 * If precopy is sending the same page, let it be done in precopy, or 2456 * we could send the same page in two channels and none of them will 2457 * receive the whole page. 2458 */ 2459 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2460 trace_postcopy_preempt_hit(pss->block->idstr, 2461 pss->page << TARGET_PAGE_BITS); 2462 return 0; 2463 } 2464 2465 do { 2466 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2467 2468 if (page_dirty) { 2469 /* Be strict to return code; it must be 1, or what else? */ 2470 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2471 error_report_once("%s: ram_save_target_page failed", __func__); 2472 ret = -1; 2473 goto out; 2474 } 2475 sent = true; 2476 } 2477 pss_find_next_dirty(pss); 2478 } while (pss_within_range(pss)); 2479 out: 2480 pss_host_page_finish(pss); 2481 /* For urgent requests, flush immediately if sent */ 2482 if (sent) { 2483 qemu_fflush(pss->pss_channel); 2484 } 2485 return ret; 2486 } 2487 2488 /** 2489 * ram_save_host_page: save a whole host page 2490 * 2491 * Starting at *offset send pages up to the end of the current host 2492 * page. It's valid for the initial offset to point into the middle of 2493 * a host page in which case the remainder of the hostpage is sent. 2494 * Only dirty target pages are sent. Note that the host page size may 2495 * be a huge page for this block. 2496 * 2497 * The saving stops at the boundary of the used_length of the block 2498 * if the RAMBlock isn't a multiple of the host page size. 2499 * 2500 * The caller must be with ram_state.bitmap_mutex held to call this 2501 * function. Note that this function can temporarily release the lock, but 2502 * when the function is returned it'll make sure the lock is still held. 2503 * 2504 * Returns the number of pages written or negative on error 2505 * 2506 * @rs: current RAM state 2507 * @pss: data about the page we want to send 2508 */ 2509 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2510 { 2511 bool page_dirty, preempt_active = postcopy_preempt_active(); 2512 int tmppages, pages = 0; 2513 size_t pagesize_bits = 2514 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2515 unsigned long start_page = pss->page; 2516 int res; 2517 2518 if (ramblock_is_ignored(pss->block)) { 2519 error_report("block %s should not be migrated !", pss->block->idstr); 2520 return 0; 2521 } 2522 2523 /* Update host page boundary information */ 2524 pss_host_page_prepare(pss); 2525 2526 do { 2527 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2528 2529 /* Check the pages is dirty and if it is send it */ 2530 if (page_dirty) { 2531 /* 2532 * Properly yield the lock only in postcopy preempt mode 2533 * because both migration thread and rp-return thread can 2534 * operate on the bitmaps. 2535 */ 2536 if (preempt_active) { 2537 qemu_mutex_unlock(&rs->bitmap_mutex); 2538 } 2539 tmppages = migration_ops->ram_save_target_page(rs, pss); 2540 if (tmppages >= 0) { 2541 pages += tmppages; 2542 /* 2543 * Allow rate limiting to happen in the middle of huge pages if 2544 * something is sent in the current iteration. 
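* Without this, all target pages of a large host page would be sent back to back with no chance for the rate limiter to run in between.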
2545 */ 2546 if (pagesize_bits > 1 && tmppages > 0) { 2547 migration_rate_limit(); 2548 } 2549 } 2550 if (preempt_active) { 2551 qemu_mutex_lock(&rs->bitmap_mutex); 2552 } 2553 } else { 2554 tmppages = 0; 2555 } 2556 2557 if (tmppages < 0) { 2558 pss_host_page_finish(pss); 2559 return tmppages; 2560 } 2561 2562 pss_find_next_dirty(pss); 2563 } while (pss_within_range(pss)); 2564 2565 pss_host_page_finish(pss); 2566 2567 res = ram_save_release_protection(rs, pss, start_page); 2568 return (res < 0 ? res : pages); 2569 } 2570 2571 /** 2572 * ram_find_and_save_block: finds a dirty page and sends it to f 2573 * 2574 * Called within an RCU critical section. 2575 * 2576 * Returns the number of pages written where zero means no dirty pages, 2577 * or negative on error 2578 * 2579 * @rs: current RAM state 2580 * 2581 * On systems where host-page-size > target-page-size it will send all the 2582 * pages in a host page that are dirty. 2583 */ 2584 static int ram_find_and_save_block(RAMState *rs) 2585 { 2586 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2587 int pages = 0; 2588 2589 /* No dirty page as there is zero RAM */ 2590 if (!rs->ram_bytes_total) { 2591 return pages; 2592 } 2593 2594 /* 2595 * Always keep last_seen_block/last_page valid during this procedure, 2596 * because find_dirty_block() relies on these values (e.g., we compare 2597 * last_seen_block with pss.block to see whether we searched all the 2598 * ramblocks) to detect the completion of migration. Having NULL value 2599 * of last_seen_block can conditionally cause below loop to run forever. 2600 */ 2601 if (!rs->last_seen_block) { 2602 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2603 rs->last_page = 0; 2604 } 2605 2606 pss_init(pss, rs->last_seen_block, rs->last_page); 2607 2608 while (true){ 2609 if (!get_queued_page(rs, pss)) { 2610 /* priority queue empty, so just search for something dirty */ 2611 int res = find_dirty_block(rs, pss); 2612 if (res != PAGE_DIRTY_FOUND) { 2613 if (res == PAGE_ALL_CLEAN) { 2614 break; 2615 } else if (res == PAGE_TRY_AGAIN) { 2616 continue; 2617 } 2618 } 2619 } 2620 pages = ram_save_host_page(rs, pss); 2621 if (pages) { 2622 break; 2623 } 2624 } 2625 2626 rs->last_seen_block = pss->block; 2627 rs->last_page = pss->page; 2628 2629 return pages; 2630 } 2631 2632 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2633 { 2634 uint64_t pages = size / TARGET_PAGE_SIZE; 2635 2636 if (zero) { 2637 stat64_add(&ram_atomic_counters.duplicate, pages); 2638 } else { 2639 stat64_add(&ram_atomic_counters.normal, pages); 2640 ram_transferred_add(size); 2641 qemu_file_credit_transfer(f, size); 2642 } 2643 } 2644 2645 static uint64_t ram_bytes_total_with_ignored(void) 2646 { 2647 RAMBlock *block; 2648 uint64_t total = 0; 2649 2650 RCU_READ_LOCK_GUARD(); 2651 2652 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2653 total += block->used_length; 2654 } 2655 return total; 2656 } 2657 2658 uint64_t ram_bytes_total(void) 2659 { 2660 RAMBlock *block; 2661 uint64_t total = 0; 2662 2663 RCU_READ_LOCK_GUARD(); 2664 2665 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2666 total += block->used_length; 2667 } 2668 return total; 2669 } 2670 2671 static void xbzrle_load_setup(void) 2672 { 2673 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2674 } 2675 2676 static void xbzrle_load_cleanup(void) 2677 { 2678 g_free(XBZRLE.decoded_buf); 2679 XBZRLE.decoded_buf = NULL; 2680 } 2681 2682 static void ram_state_cleanup(RAMState **rsp) 2683 { 2684 if (*rsp) { 2685 migration_page_queue_free(*rsp); 2686 
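/* The request queue has been drained above; now tear down the mutexes and free the state itself. */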
qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2687 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2688 g_free(*rsp); 2689 *rsp = NULL; 2690 } 2691 } 2692 2693 static void xbzrle_cleanup(void) 2694 { 2695 XBZRLE_cache_lock(); 2696 if (XBZRLE.cache) { 2697 cache_fini(XBZRLE.cache); 2698 g_free(XBZRLE.encoded_buf); 2699 g_free(XBZRLE.current_buf); 2700 g_free(XBZRLE.zero_target_page); 2701 XBZRLE.cache = NULL; 2702 XBZRLE.encoded_buf = NULL; 2703 XBZRLE.current_buf = NULL; 2704 XBZRLE.zero_target_page = NULL; 2705 } 2706 XBZRLE_cache_unlock(); 2707 } 2708 2709 static void ram_save_cleanup(void *opaque) 2710 { 2711 RAMState **rsp = opaque; 2712 RAMBlock *block; 2713 2714 /* We don't use dirty log with background snapshots */ 2715 if (!migrate_background_snapshot()) { 2716 /* caller have hold iothread lock or is in a bh, so there is 2717 * no writing race against the migration bitmap 2718 */ 2719 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2720 /* 2721 * do not stop dirty log without starting it, since 2722 * memory_global_dirty_log_stop will assert that 2723 * memory_global_dirty_log_start/stop used in pairs 2724 */ 2725 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2726 } 2727 } 2728 2729 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2730 g_free(block->clear_bmap); 2731 block->clear_bmap = NULL; 2732 g_free(block->bmap); 2733 block->bmap = NULL; 2734 } 2735 2736 xbzrle_cleanup(); 2737 compress_threads_save_cleanup(); 2738 ram_state_cleanup(rsp); 2739 g_free(migration_ops); 2740 migration_ops = NULL; 2741 } 2742 2743 static void ram_state_reset(RAMState *rs) 2744 { 2745 int i; 2746 2747 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2748 rs->pss[i].last_sent_block = NULL; 2749 } 2750 2751 rs->last_seen_block = NULL; 2752 rs->last_page = 0; 2753 rs->last_version = ram_list.version; 2754 rs->xbzrle_enabled = false; 2755 } 2756 2757 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2758 2759 /* **** functions for postcopy ***** */ 2760 2761 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2762 { 2763 struct RAMBlock *block; 2764 2765 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2766 unsigned long *bitmap = block->bmap; 2767 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2768 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2769 2770 while (run_start < range) { 2771 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2772 ram_discard_range(block->idstr, 2773 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2774 ((ram_addr_t)(run_end - run_start)) 2775 << TARGET_PAGE_BITS); 2776 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2777 } 2778 } 2779 } 2780 2781 /** 2782 * postcopy_send_discard_bm_ram: discard a RAMBlock 2783 * 2784 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2785 * 2786 * @ms: current migration state 2787 * @block: RAMBlock to discard 2788 */ 2789 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2790 { 2791 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2792 unsigned long current; 2793 unsigned long *bitmap = block->bmap; 2794 2795 for (current = 0; current < end; ) { 2796 unsigned long one = find_next_bit(bitmap, end, current); 2797 unsigned long zero, discard_length; 2798 2799 if (one >= end) { 2800 break; 2801 } 2802 2803 zero = find_next_zero_bit(bitmap, end, one + 1); 2804 2805 if (zero >= end) { 2806 discard_length = end - one; 2807 } else { 2808 discard_length = zero - one; 2809 } 2810 postcopy_discard_send_range(ms, one, discard_length); 2811 current 
= one + discard_length; 2812 } 2813 } 2814 2815 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2816 2817 /** 2818 * postcopy_each_ram_send_discard: discard all RAMBlocks 2819 * 2820 * Utility for the outgoing postcopy code. 2821 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2822 * passing it bitmap indexes and name. 2823 * (qemu_ram_foreach_block ends up passing unscaled lengths 2824 * which would mean postcopy code would have to deal with target page) 2825 * 2826 * @ms: current migration state 2827 */ 2828 static void postcopy_each_ram_send_discard(MigrationState *ms) 2829 { 2830 struct RAMBlock *block; 2831 2832 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2833 postcopy_discard_send_init(ms, block->idstr); 2834 2835 /* 2836 * Deal with TPS != HPS and huge pages. It discard any partially sent 2837 * host-page size chunks, mark any partially dirty host-page size 2838 * chunks as all dirty. In this case the host-page is the host-page 2839 * for the particular RAMBlock, i.e. it might be a huge page. 2840 */ 2841 postcopy_chunk_hostpages_pass(ms, block); 2842 2843 /* 2844 * Postcopy sends chunks of bitmap over the wire, but it 2845 * just needs indexes at this point, avoids it having 2846 * target page specific code. 2847 */ 2848 postcopy_send_discard_bm_ram(ms, block); 2849 postcopy_discard_send_finish(ms); 2850 } 2851 } 2852 2853 /** 2854 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2855 * 2856 * Helper for postcopy_chunk_hostpages; it's called twice to 2857 * canonicalize the two bitmaps, that are similar, but one is 2858 * inverted. 2859 * 2860 * Postcopy requires that all target pages in a hostpage are dirty or 2861 * clean, not a mix. This function canonicalizes the bitmaps. 2862 * 2863 * @ms: current migration state 2864 * @block: block that contains the page we want to canonicalize 2865 */ 2866 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2867 { 2868 RAMState *rs = ram_state; 2869 unsigned long *bitmap = block->bmap; 2870 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2871 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2872 unsigned long run_start; 2873 2874 if (block->page_size == TARGET_PAGE_SIZE) { 2875 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2876 return; 2877 } 2878 2879 /* Find a dirty page */ 2880 run_start = find_next_bit(bitmap, pages, 0); 2881 2882 while (run_start < pages) { 2883 2884 /* 2885 * If the start of this run of pages is in the middle of a host 2886 * page, then we need to fixup this host page. 2887 */ 2888 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2889 /* Find the end of this run */ 2890 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2891 /* 2892 * If the end isn't at the start of a host page, then the 2893 * run doesn't finish at the end of a host page 2894 * and we need to discard. 2895 */ 2896 } 2897 2898 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2899 unsigned long page; 2900 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2901 host_ratio); 2902 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2903 2904 /* Clean up the bitmap */ 2905 for (page = fixup_start_addr; 2906 page < fixup_start_addr + host_ratio; page++) { 2907 /* 2908 * Remark them as dirty, updating the count for any pages 2909 * that weren't previously dirty. 
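* test_and_set_bit() returns the previous bit value, so only pages that actually flip from clean to dirty bump migration_dirty_pages.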
2910 */ 2911 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2912 } 2913 } 2914 2915 /* Find the next dirty page for the next iteration */ 2916 run_start = find_next_bit(bitmap, pages, run_start); 2917 } 2918 } 2919 2920 /** 2921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2922 * 2923 * Transmit the set of pages to be discarded after precopy to the target 2924 * these are pages that: 2925 * a) Have been previously transmitted but are now dirty again 2926 * b) Pages that have never been transmitted, this ensures that 2927 * any pages on the destination that have been mapped by background 2928 * tasks get discarded (transparent huge pages is the specific concern) 2929 * Hopefully this is pretty sparse 2930 * 2931 * @ms: current migration state 2932 */ 2933 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2934 { 2935 RAMState *rs = ram_state; 2936 2937 RCU_READ_LOCK_GUARD(); 2938 2939 /* This should be our last sync, the src is now paused */ 2940 migration_bitmap_sync(rs); 2941 2942 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2943 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2944 rs->last_seen_block = NULL; 2945 rs->last_page = 0; 2946 2947 postcopy_each_ram_send_discard(ms); 2948 2949 trace_ram_postcopy_send_discard_bitmap(); 2950 } 2951 2952 /** 2953 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2954 * 2955 * Returns zero on success 2956 * 2957 * @rbname: name of the RAMBlock of the request. NULL means the 2958 * same that last one. 2959 * @start: RAMBlock starting page 2960 * @length: RAMBlock size 2961 */ 2962 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2963 { 2964 trace_ram_discard_range(rbname, start, length); 2965 2966 RCU_READ_LOCK_GUARD(); 2967 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2968 2969 if (!rb) { 2970 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2971 return -1; 2972 } 2973 2974 /* 2975 * On source VM, we don't need to update the received bitmap since 2976 * we don't even have one. 2977 */ 2978 if (rb->receivedmap) { 2979 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2980 length >> qemu_target_page_bits()); 2981 } 2982 2983 return ram_block_discard_range(rb, start, length); 2984 } 2985 2986 /* 2987 * For every allocation, we will try not to crash the VM if the 2988 * allocation failed. 
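* That is why xbzrle_init() below uses the g_try_* allocators and unwinds whatever was already allocated before returning -ENOMEM.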
2989 */ 2990 static int xbzrle_init(void) 2991 { 2992 Error *local_err = NULL; 2993 2994 if (!migrate_use_xbzrle()) { 2995 return 0; 2996 } 2997 2998 XBZRLE_cache_lock(); 2999 3000 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 3001 if (!XBZRLE.zero_target_page) { 3002 error_report("%s: Error allocating zero page", __func__); 3003 goto err_out; 3004 } 3005 3006 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 3007 TARGET_PAGE_SIZE, &local_err); 3008 if (!XBZRLE.cache) { 3009 error_report_err(local_err); 3010 goto free_zero_page; 3011 } 3012 3013 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3014 if (!XBZRLE.encoded_buf) { 3015 error_report("%s: Error allocating encoded_buf", __func__); 3016 goto free_cache; 3017 } 3018 3019 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3020 if (!XBZRLE.current_buf) { 3021 error_report("%s: Error allocating current_buf", __func__); 3022 goto free_encoded_buf; 3023 } 3024 3025 /* We are all good */ 3026 XBZRLE_cache_unlock(); 3027 return 0; 3028 3029 free_encoded_buf: 3030 g_free(XBZRLE.encoded_buf); 3031 XBZRLE.encoded_buf = NULL; 3032 free_cache: 3033 cache_fini(XBZRLE.cache); 3034 XBZRLE.cache = NULL; 3035 free_zero_page: 3036 g_free(XBZRLE.zero_target_page); 3037 XBZRLE.zero_target_page = NULL; 3038 err_out: 3039 XBZRLE_cache_unlock(); 3040 return -ENOMEM; 3041 } 3042 3043 static int ram_state_init(RAMState **rsp) 3044 { 3045 *rsp = g_try_new0(RAMState, 1); 3046 3047 if (!*rsp) { 3048 error_report("%s: Init ramstate fail", __func__); 3049 return -1; 3050 } 3051 3052 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3053 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3054 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3055 (*rsp)->ram_bytes_total = ram_bytes_total(); 3056 3057 /* 3058 * Count the total number of pages used by ram blocks not including any 3059 * gaps due to alignment or unplugs. 3060 * This must match with the initial values of dirty bitmap. 3061 */ 3062 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3063 ram_state_reset(*rsp); 3064 3065 return 0; 3066 } 3067 3068 static void ram_list_init_bitmaps(void) 3069 { 3070 MigrationState *ms = migrate_get_current(); 3071 RAMBlock *block; 3072 unsigned long pages; 3073 uint8_t shift; 3074 3075 /* Skip setting bitmap if there is no RAM */ 3076 if (ram_bytes_total()) { 3077 shift = ms->clear_bitmap_shift; 3078 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3079 error_report("clear_bitmap_shift (%u) too big, using " 3080 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3081 shift = CLEAR_BITMAP_SHIFT_MAX; 3082 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3083 error_report("clear_bitmap_shift (%u) too small, using " 3084 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3085 shift = CLEAR_BITMAP_SHIFT_MIN; 3086 } 3087 3088 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3089 pages = block->max_length >> TARGET_PAGE_BITS; 3090 /* 3091 * The initial dirty bitmap for migration must be set with all 3092 * ones to make sure we'll migrate every guest RAM page to 3093 * destination. 3094 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3095 * new migration after a failed migration, ram_list. 3096 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3097 * guest memory. 
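* Pages that are actually discarded get excluded again right after the first bitmap sync, see migration_bitmap_clear_discarded_pages().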
3098 */ 3099 block->bmap = bitmap_new(pages); 3100 bitmap_set(block->bmap, 0, pages); 3101 block->clear_bmap_shift = shift; 3102 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3103 } 3104 } 3105 } 3106 3107 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3108 { 3109 unsigned long pages; 3110 RAMBlock *rb; 3111 3112 RCU_READ_LOCK_GUARD(); 3113 3114 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3115 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3116 rs->migration_dirty_pages -= pages; 3117 } 3118 } 3119 3120 static void ram_init_bitmaps(RAMState *rs) 3121 { 3122 /* For memory_global_dirty_log_start below. */ 3123 qemu_mutex_lock_iothread(); 3124 qemu_mutex_lock_ramlist(); 3125 3126 WITH_RCU_READ_LOCK_GUARD() { 3127 ram_list_init_bitmaps(); 3128 /* We don't use dirty log with background snapshots */ 3129 if (!migrate_background_snapshot()) { 3130 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3131 migration_bitmap_sync_precopy(rs); 3132 } 3133 } 3134 qemu_mutex_unlock_ramlist(); 3135 qemu_mutex_unlock_iothread(); 3136 3137 /* 3138 * After an eventual first bitmap sync, fixup the initial bitmap 3139 * containing all 1s to exclude any discarded pages from migration. 3140 */ 3141 migration_bitmap_clear_discarded_pages(rs); 3142 } 3143 3144 static int ram_init_all(RAMState **rsp) 3145 { 3146 if (ram_state_init(rsp)) { 3147 return -1; 3148 } 3149 3150 if (xbzrle_init()) { 3151 ram_state_cleanup(rsp); 3152 return -1; 3153 } 3154 3155 ram_init_bitmaps(*rsp); 3156 3157 return 0; 3158 } 3159 3160 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3161 { 3162 RAMBlock *block; 3163 uint64_t pages = 0; 3164 3165 /* 3166 * Postcopy is not using xbzrle/compression, so no need for that. 3167 * Also, since source are already halted, we don't need to care 3168 * about dirty page logging as well. 3169 */ 3170 3171 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3172 pages += bitmap_count_one(block->bmap, 3173 block->used_length >> TARGET_PAGE_BITS); 3174 } 3175 3176 /* This may not be aligned with current bitmaps. Recalculate. */ 3177 rs->migration_dirty_pages = pages; 3178 3179 ram_state_reset(rs); 3180 3181 /* Update RAMState cache of output QEMUFile */ 3182 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3183 3184 trace_ram_state_resume_prepare(pages); 3185 } 3186 3187 /* 3188 * This function clears bits of the free pages reported by the caller from the 3189 * migration dirty bitmap. @addr is the host address corresponding to the 3190 * start of the continuous guest free pages, and @len is the total bytes of 3191 * those pages. 3192 */ 3193 void qemu_guest_free_page_hint(void *addr, size_t len) 3194 { 3195 RAMBlock *block; 3196 ram_addr_t offset; 3197 size_t used_len, start, npages; 3198 MigrationState *s = migrate_get_current(); 3199 3200 /* This function is currently expected to be used during live migration */ 3201 if (!migration_is_setup_or_active(s->state)) { 3202 return; 3203 } 3204 3205 for (; len > 0; len -= used_len, addr += used_len) { 3206 block = qemu_ram_block_from_host(addr, false, &offset); 3207 if (unlikely(!block || offset >= block->used_length)) { 3208 /* 3209 * The implementation might not support RAMBlock resize during 3210 * live migration, but it could happen in theory with future 3211 * updates. So we add a check here to capture that case. 
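* The error below is reported only once, so repeated hints for an out-of-range address do not flood the log.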
3212 */ 3213 error_report_once("%s unexpected error", __func__); 3214 return; 3215 } 3216 3217 if (len <= block->used_length - offset) { 3218 used_len = len; 3219 } else { 3220 used_len = block->used_length - offset; 3221 } 3222 3223 start = offset >> TARGET_PAGE_BITS; 3224 npages = used_len >> TARGET_PAGE_BITS; 3225 3226 qemu_mutex_lock(&ram_state->bitmap_mutex); 3227 /* 3228 * The skipped free pages are equavalent to be sent from clear_bmap's 3229 * perspective, so clear the bits from the memory region bitmap which 3230 * are initially set. Otherwise those skipped pages will be sent in 3231 * the next round after syncing from the memory region bitmap. 3232 */ 3233 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3234 ram_state->migration_dirty_pages -= 3235 bitmap_count_one_with_offset(block->bmap, start, npages); 3236 bitmap_clear(block->bmap, start, npages); 3237 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3238 } 3239 } 3240 3241 /* 3242 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3243 * long-running RCU critical section. When rcu-reclaims in the code 3244 * start to become numerous it will be necessary to reduce the 3245 * granularity of these critical sections. 3246 */ 3247 3248 /** 3249 * ram_save_setup: Setup RAM for migration 3250 * 3251 * Returns zero to indicate success and negative for error 3252 * 3253 * @f: QEMUFile where to send the data 3254 * @opaque: RAMState pointer 3255 */ 3256 static int ram_save_setup(QEMUFile *f, void *opaque) 3257 { 3258 RAMState **rsp = opaque; 3259 RAMBlock *block; 3260 int ret; 3261 3262 if (compress_threads_save_setup()) { 3263 return -1; 3264 } 3265 3266 /* migration has already setup the bitmap, reuse it. */ 3267 if (!migration_in_colo_state()) { 3268 if (ram_init_all(rsp) != 0) { 3269 compress_threads_save_cleanup(); 3270 return -1; 3271 } 3272 } 3273 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3274 3275 WITH_RCU_READ_LOCK_GUARD() { 3276 qemu_put_be64(f, ram_bytes_total_with_ignored() 3277 | RAM_SAVE_FLAG_MEM_SIZE); 3278 3279 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3280 qemu_put_byte(f, strlen(block->idstr)); 3281 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3282 qemu_put_be64(f, block->used_length); 3283 if (migrate_postcopy_ram() && block->page_size != 3284 qemu_host_page_size) { 3285 qemu_put_be64(f, block->page_size); 3286 } 3287 if (migrate_ignore_shared()) { 3288 qemu_put_be64(f, block->mr->addr); 3289 } 3290 } 3291 } 3292 3293 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3294 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3295 3296 migration_ops = g_malloc0(sizeof(MigrationOps)); 3297 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3298 ret = multifd_send_sync_main(f); 3299 if (ret < 0) { 3300 return ret; 3301 } 3302 3303 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3304 qemu_fflush(f); 3305 3306 return 0; 3307 } 3308 3309 /** 3310 * ram_save_iterate: iterative stage for migration 3311 * 3312 * Returns zero to indicate success and negative for error 3313 * 3314 * @f: QEMUFile where to send the data 3315 * @opaque: RAMState pointer 3316 */ 3317 static int ram_save_iterate(QEMUFile *f, void *opaque) 3318 { 3319 RAMState **temp = opaque; 3320 RAMState *rs = *temp; 3321 int ret = 0; 3322 int i; 3323 int64_t t0; 3324 int done = 0; 3325 3326 if (blk_mig_bulk_active()) { 3327 /* Avoid transferring ram during bulk phase of block migration as 3328 * the bulk phase will usually take a long time and transferring 3329 * ram updates during that time is 
pointless. */ 3330 goto out; 3331 } 3332 3333 /* 3334 * We'll hold this lock for a while, but it's okay for two reasons. 3335 * Firstly, the only other thread that may take it is the one that calls 3336 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3337 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3338 * guarantees that we'll release it on a regular basis. 3339 */ 3340 qemu_mutex_lock(&rs->bitmap_mutex); 3341 WITH_RCU_READ_LOCK_GUARD() { 3342 if (ram_list.version != rs->last_version) { 3343 ram_state_reset(rs); 3344 } 3345 3346 /* Read version before ram_list.blocks */ 3347 smp_rmb(); 3348 3349 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3350 3351 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3352 i = 0; 3353 while ((ret = qemu_file_rate_limit(f)) == 0 || 3354 postcopy_has_request(rs)) { 3355 int pages; 3356 3357 if (qemu_file_get_error(f)) { 3358 break; 3359 } 3360 3361 pages = ram_find_and_save_block(rs); 3362 /* no more pages to send */ 3363 if (pages == 0) { 3364 done = 1; 3365 break; 3366 } 3367 3368 if (pages < 0) { 3369 qemu_file_set_error(f, pages); 3370 break; 3371 } 3372 3373 rs->target_page_count += pages; 3374 3375 /* 3376 * During postcopy, it is necessary to make sure one whole host 3377 * page is sent in one chunk. 3378 */ 3379 if (migrate_postcopy_ram()) { 3380 flush_compressed_data(rs); 3381 } 3382 3383 /* 3384 * We want to check in the 1st loop, just in case it was the 1st 3385 * time and we had to sync the dirty bitmap. 3386 * qemu_clock_get_ns() is a bit expensive, so we only check once 3387 * every few iterations. 3388 */ 3389 if ((i & 63) == 0) { 3390 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3391 1000000; 3392 if (t1 > MAX_WAIT) { 3393 trace_ram_save_iterate_big_wait(t1, i); 3394 break; 3395 } 3396 } 3397 i++; 3398 } 3399 } 3400 qemu_mutex_unlock(&rs->bitmap_mutex); 3401 3402 /* 3403 * Must occur before EOS (or any QEMUFile operation) 3404 * because of RDMA protocol.
3405 */ 3406 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3407 3408 out: 3409 if (ret >= 0 3410 && migration_is_setup_or_active(migrate_get_current()->state)) { 3411 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3412 if (ret < 0) { 3413 return ret; 3414 } 3415 3416 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3417 qemu_fflush(f); 3418 ram_transferred_add(8); 3419 3420 ret = qemu_file_get_error(f); 3421 } 3422 if (ret < 0) { 3423 return ret; 3424 } 3425 3426 return done; 3427 } 3428 3429 /** 3430 * ram_save_complete: function called to send the remaining amount of ram 3431 * 3432 * Returns zero to indicate success or negative on error 3433 * 3434 * Called with iothread lock 3435 * 3436 * @f: QEMUFile where to send the data 3437 * @opaque: RAMState pointer 3438 */ 3439 static int ram_save_complete(QEMUFile *f, void *opaque) 3440 { 3441 RAMState **temp = opaque; 3442 RAMState *rs = *temp; 3443 int ret = 0; 3444 3445 rs->last_stage = !migration_in_colo_state(); 3446 3447 WITH_RCU_READ_LOCK_GUARD() { 3448 if (!migration_in_postcopy()) { 3449 migration_bitmap_sync_precopy(rs); 3450 } 3451 3452 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3453 3454 /* try transferring iterative blocks of memory */ 3455 3456 /* flush all remaining blocks regardless of rate limiting */ 3457 qemu_mutex_lock(&rs->bitmap_mutex); 3458 while (true) { 3459 int pages; 3460 3461 pages = ram_find_and_save_block(rs); 3462 /* no more blocks to sent */ 3463 if (pages == 0) { 3464 break; 3465 } 3466 if (pages < 0) { 3467 ret = pages; 3468 break; 3469 } 3470 } 3471 qemu_mutex_unlock(&rs->bitmap_mutex); 3472 3473 flush_compressed_data(rs); 3474 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3475 } 3476 3477 if (ret < 0) { 3478 return ret; 3479 } 3480 3481 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3482 if (ret < 0) { 3483 return ret; 3484 } 3485 3486 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3487 qemu_fflush(f); 3488 3489 return 0; 3490 } 3491 3492 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3493 uint64_t *can_postcopy) 3494 { 3495 RAMState **temp = opaque; 3496 RAMState *rs = *temp; 3497 3498 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3499 3500 if (migrate_postcopy_ram()) { 3501 /* We can do postcopy, and all the data is postcopiable */ 3502 *can_postcopy += remaining_size; 3503 } else { 3504 *must_precopy += remaining_size; 3505 } 3506 } 3507 3508 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3509 uint64_t *can_postcopy) 3510 { 3511 RAMState **temp = opaque; 3512 RAMState *rs = *temp; 3513 3514 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3515 3516 if (!migration_in_postcopy()) { 3517 qemu_mutex_lock_iothread(); 3518 WITH_RCU_READ_LOCK_GUARD() { 3519 migration_bitmap_sync_precopy(rs); 3520 } 3521 qemu_mutex_unlock_iothread(); 3522 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3523 } 3524 3525 if (migrate_postcopy_ram()) { 3526 /* We can do postcopy, and all the data is postcopiable */ 3527 *can_postcopy += remaining_size; 3528 } else { 3529 *must_precopy += remaining_size; 3530 } 3531 } 3532 3533 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3534 { 3535 unsigned int xh_len; 3536 int xh_flags; 3537 uint8_t *loaded_data; 3538 3539 /* extract RLE header */ 3540 xh_flags = qemu_get_byte(f); 3541 xh_len = qemu_get_be16(f); 3542 3543 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3544 error_report("Failed to load XBZRLE page - wrong 
compression!"); 3545 return -1; 3546 } 3547 3548 if (xh_len > TARGET_PAGE_SIZE) { 3549 error_report("Failed to load XBZRLE page - len overflow!"); 3550 return -1; 3551 } 3552 loaded_data = XBZRLE.decoded_buf; 3553 /* load data and decode */ 3554 /* it can change loaded_data to point to an internal buffer */ 3555 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3556 3557 /* decode RLE */ 3558 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3559 TARGET_PAGE_SIZE) == -1) { 3560 error_report("Failed to load XBZRLE page - decode error!"); 3561 return -1; 3562 } 3563 3564 return 0; 3565 } 3566 3567 /** 3568 * ram_block_from_stream: read a RAMBlock id from the migration stream 3569 * 3570 * Must be called from within a rcu critical section. 3571 * 3572 * Returns a pointer from within the RCU-protected ram_list. 3573 * 3574 * @mis: the migration incoming state pointer 3575 * @f: QEMUFile where to read the data from 3576 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3577 * @channel: the channel we're using 3578 */ 3579 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3580 QEMUFile *f, int flags, 3581 int channel) 3582 { 3583 RAMBlock *block = mis->last_recv_block[channel]; 3584 char id[256]; 3585 uint8_t len; 3586 3587 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3588 if (!block) { 3589 error_report("Ack, bad migration stream!"); 3590 return NULL; 3591 } 3592 return block; 3593 } 3594 3595 len = qemu_get_byte(f); 3596 qemu_get_buffer(f, (uint8_t *)id, len); 3597 id[len] = 0; 3598 3599 block = qemu_ram_block_by_name(id); 3600 if (!block) { 3601 error_report("Can't find block %s", id); 3602 return NULL; 3603 } 3604 3605 if (ramblock_is_ignored(block)) { 3606 error_report("block %s should not be migrated !", id); 3607 return NULL; 3608 } 3609 3610 mis->last_recv_block[channel] = block; 3611 3612 return block; 3613 } 3614 3615 static inline void *host_from_ram_block_offset(RAMBlock *block, 3616 ram_addr_t offset) 3617 { 3618 if (!offset_in_ramblock(block, offset)) { 3619 return NULL; 3620 } 3621 3622 return block->host + offset; 3623 } 3624 3625 static void *host_page_from_ram_block_offset(RAMBlock *block, 3626 ram_addr_t offset) 3627 { 3628 /* Note: Explicitly no check against offset_in_ramblock(). */ 3629 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3630 block->page_size); 3631 } 3632 3633 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3634 ram_addr_t offset) 3635 { 3636 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3637 } 3638 3639 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3640 ram_addr_t offset, bool record_bitmap) 3641 { 3642 if (!offset_in_ramblock(block, offset)) { 3643 return NULL; 3644 } 3645 if (!block->colo_cache) { 3646 error_report("%s: colo_cache is NULL in block :%s", 3647 __func__, block->idstr); 3648 return NULL; 3649 } 3650 3651 /* 3652 * During colo checkpoint, we need bitmap of these migrated pages. 3653 * It help us to decide which pages in ram cache should be flushed 3654 * into VM's RAM later. 3655 */ 3656 if (record_bitmap && 3657 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3658 ram_state->migration_dirty_pages++; 3659 } 3660 return block->colo_cache + offset; 3661 } 3662 3663 /** 3664 * ram_handle_compressed: handle the zero page case 3665 * 3666 * If a page (or a whole RDMA chunk) has been 3667 * determined to be zero, then zap it. 
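* The memset is skipped when the destination already reads as all zeroes, which avoids dirtying pages needlessly.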
3668 * 3669 * @host: host address for the zero page 3670 * @ch: what the page is filled from. We only support zero 3671 * @size: size of the zero page 3672 */ 3673 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3674 { 3675 if (ch != 0 || !buffer_is_zero(host, size)) { 3676 memset(host, ch, size); 3677 } 3678 } 3679 3680 /* return the size after decompression, or negative value on error */ 3681 static int 3682 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3683 const uint8_t *source, size_t source_len) 3684 { 3685 int err; 3686 3687 err = inflateReset(stream); 3688 if (err != Z_OK) { 3689 return -1; 3690 } 3691 3692 stream->avail_in = source_len; 3693 stream->next_in = (uint8_t *)source; 3694 stream->avail_out = dest_len; 3695 stream->next_out = dest; 3696 3697 err = inflate(stream, Z_NO_FLUSH); 3698 if (err != Z_STREAM_END) { 3699 return -1; 3700 } 3701 3702 return stream->total_out; 3703 } 3704 3705 static void *do_data_decompress(void *opaque) 3706 { 3707 DecompressParam *param = opaque; 3708 unsigned long pagesize; 3709 uint8_t *des; 3710 int len, ret; 3711 3712 qemu_mutex_lock(&param->mutex); 3713 while (!param->quit) { 3714 if (param->des) { 3715 des = param->des; 3716 len = param->len; 3717 param->des = 0; 3718 qemu_mutex_unlock(&param->mutex); 3719 3720 pagesize = TARGET_PAGE_SIZE; 3721 3722 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3723 param->compbuf, len); 3724 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3725 error_report("decompress data failed"); 3726 qemu_file_set_error(decomp_file, ret); 3727 } 3728 3729 qemu_mutex_lock(&decomp_done_lock); 3730 param->done = true; 3731 qemu_cond_signal(&decomp_done_cond); 3732 qemu_mutex_unlock(&decomp_done_lock); 3733 3734 qemu_mutex_lock(&param->mutex); 3735 } else { 3736 qemu_cond_wait(&param->cond, &param->mutex); 3737 } 3738 } 3739 qemu_mutex_unlock(&param->mutex); 3740 3741 return NULL; 3742 } 3743 3744 static int wait_for_decompress_done(void) 3745 { 3746 int idx, thread_count; 3747 3748 if (!migrate_use_compression()) { 3749 return 0; 3750 } 3751 3752 thread_count = migrate_decompress_threads(); 3753 qemu_mutex_lock(&decomp_done_lock); 3754 for (idx = 0; idx < thread_count; idx++) { 3755 while (!decomp_param[idx].done) { 3756 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3757 } 3758 } 3759 qemu_mutex_unlock(&decomp_done_lock); 3760 return qemu_file_get_error(decomp_file); 3761 } 3762 3763 static void compress_threads_load_cleanup(void) 3764 { 3765 int i, thread_count; 3766 3767 if (!migrate_use_compression()) { 3768 return; 3769 } 3770 thread_count = migrate_decompress_threads(); 3771 for (i = 0; i < thread_count; i++) { 3772 /* 3773 * we use it as an indicator which shows if the thread is 3774 * properly init'd or not 3775 */ 3776 if (!decomp_param[i].compbuf) { 3777 break; 3778 } 3779 3780 qemu_mutex_lock(&decomp_param[i].mutex); 3781 decomp_param[i].quit = true; 3782 qemu_cond_signal(&decomp_param[i].cond); 3783 qemu_mutex_unlock(&decomp_param[i].mutex); 3784 } 3785 for (i = 0; i < thread_count; i++) { 3786 if (!decomp_param[i].compbuf) { 3787 break; 3788 } 3789 3790 qemu_thread_join(decompress_threads + i); 3791 qemu_mutex_destroy(&decomp_param[i].mutex); 3792 qemu_cond_destroy(&decomp_param[i].cond); 3793 inflateEnd(&decomp_param[i].stream); 3794 g_free(decomp_param[i].compbuf); 3795 decomp_param[i].compbuf = NULL; 3796 } 3797 g_free(decompress_threads); 3798 g_free(decomp_param); 3799 decompress_threads = NULL; 3800 decomp_param = NULL; 3801 decomp_file = NULL; 3802 }
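/*
 * Allocate the decompression worker threads and their per-thread state.
 * If any zlib stream fails to initialize, everything set up so far is
 * torn down again via compress_threads_load_cleanup() and -1 is returned.
 */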
3803 3804 static int compress_threads_load_setup(QEMUFile *f) 3805 { 3806 int i, thread_count; 3807 3808 if (!migrate_use_compression()) { 3809 return 0; 3810 } 3811 3812 thread_count = migrate_decompress_threads(); 3813 decompress_threads = g_new0(QemuThread, thread_count); 3814 decomp_param = g_new0(DecompressParam, thread_count); 3815 qemu_mutex_init(&decomp_done_lock); 3816 qemu_cond_init(&decomp_done_cond); 3817 decomp_file = f; 3818 for (i = 0; i < thread_count; i++) { 3819 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3820 goto exit; 3821 } 3822 3823 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3824 qemu_mutex_init(&decomp_param[i].mutex); 3825 qemu_cond_init(&decomp_param[i].cond); 3826 decomp_param[i].done = true; 3827 decomp_param[i].quit = false; 3828 qemu_thread_create(decompress_threads + i, "decompress", 3829 do_data_decompress, decomp_param + i, 3830 QEMU_THREAD_JOINABLE); 3831 } 3832 return 0; 3833 exit: 3834 compress_threads_load_cleanup(); 3835 return -1; 3836 } 3837 3838 static void decompress_data_with_multi_threads(QEMUFile *f, 3839 void *host, int len) 3840 { 3841 int idx, thread_count; 3842 3843 thread_count = migrate_decompress_threads(); 3844 QEMU_LOCK_GUARD(&decomp_done_lock); 3845 while (true) { 3846 for (idx = 0; idx < thread_count; idx++) { 3847 if (decomp_param[idx].done) { 3848 decomp_param[idx].done = false; 3849 qemu_mutex_lock(&decomp_param[idx].mutex); 3850 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3851 decomp_param[idx].des = host; 3852 decomp_param[idx].len = len; 3853 qemu_cond_signal(&decomp_param[idx].cond); 3854 qemu_mutex_unlock(&decomp_param[idx].mutex); 3855 break; 3856 } 3857 } 3858 if (idx < thread_count) { 3859 break; 3860 } else { 3861 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3862 } 3863 } 3864 } 3865 3866 static void colo_init_ram_state(void) 3867 { 3868 ram_state_init(&ram_state); 3869 } 3870 3871 /* 3872 * colo cache: this is for secondary VM, we cache the whole 3873 * memory of the secondary VM, it is need to hold the global lock 3874 * to call this helper. 3875 */ 3876 int colo_init_ram_cache(void) 3877 { 3878 RAMBlock *block; 3879 3880 WITH_RCU_READ_LOCK_GUARD() { 3881 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3882 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3883 NULL, false, false); 3884 if (!block->colo_cache) { 3885 error_report("%s: Can't alloc memory for COLO cache of block %s," 3886 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3887 block->used_length); 3888 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3889 if (block->colo_cache) { 3890 qemu_anon_ram_free(block->colo_cache, block->used_length); 3891 block->colo_cache = NULL; 3892 } 3893 } 3894 return -errno; 3895 } 3896 if (!machine_dump_guest_core(current_machine)) { 3897 qemu_madvise(block->colo_cache, block->used_length, 3898 QEMU_MADV_DONTDUMP); 3899 } 3900 } 3901 } 3902 3903 /* 3904 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3905 * with to decide which page in cache should be flushed into SVM's RAM. Here 3906 * we use the same name 'ram_bitmap' as for migration. 
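* The bitmaps are sized from max_length and start out all clear; bits are set as dirty pages arrive from the PVM, see colo_cache_from_block_offset().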

static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}

/*
 * colo cache: this is for the secondary VM.  We cache the whole memory of
 * the secondary VM; the global lock needs to be held to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock needs to be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
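
/*
 * A note on host-page assembly in ram_load_postcopy() below: the stream
 * carries TARGET_PAGE_SIZE-sized pages, but postcopy must place whole host
 * pages atomically (they are installed via UFFDIO_COPY/UFFDIO_ZEROPAGE).
 * Incoming target pages are therefore staged in tmp_page->tmp_huge_page
 * until block->page_size / TARGET_PAGE_SIZE of them have arrived, and only
 * then is the host page placed.  For example, with a 2 MiB hugetlbfs
 * backed block and 4 KiB target pages, 512 target pages are accumulated
 * per placement.  The source guarantees that all pieces of one host page
 * are sent back to back on the same channel.
 */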

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If the QEMUFile hit an error, stop here; "addr" may be
         * invalid in that case.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages,
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip updating page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}
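
/*
 * A rough picture of how the COLO cache pieces fit together:
 * colo_init_ram_cache() allocates a colo_cache copy of every RAMBlock.
 * Once the SVM is in the COLO state, ram_load_precopy() directs incoming
 * pages into that cache instead of guest RAM, while before that state is
 * entered it loads into guest RAM and mirrors each page into the cache
 * via host_bak.  colo_flush_ram_cache() below then copies only the pages
 * marked dirty (by the PVM stream or by the SVM itself) from the cache
 * into the SVM's RAM, and colo_release_ram_cache() tears everything down.
 */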

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush the pages that have been dirtied by the PVM or the SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}
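
/*
 * A rough sketch of the precopy record format parsed by ram_load_precopy()
 * below.  Every record starts with a be64 whose low bits (below
 * TARGET_PAGE_MASK) carry the RAM_SAVE_FLAG_* bits and whose upper bits
 * carry the page address (for RAM_SAVE_FLAG_MEM_SIZE they carry the total
 * RAM size instead); the payload then depends on the flags:
 *
 *   RAM_SAVE_FLAG_MEM_SIZE       block list: {id length byte, id, be64 size}
 *                                per block, plus the optional fields checked
 *                                below (page size, GPA)
 *   RAM_SAVE_FLAG_ZERO           a single fill byte (see ram_handle_compressed())
 *   RAM_SAVE_FLAG_PAGE           TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE  be32 length + zlib data
 *   RAM_SAVE_FLAG_XBZRLE         xbzrle-encoded delta, decoded by load_xbzrle()
 *   RAM_SAVE_FLAG_EOS            end of section
 *
 * RAM_SAVE_FLAG_CONTINUE is OR-ed in when the page belongs to the same
 * RAMBlock as the previous one, so the block name is not resent.
 */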

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage we must not load pages into the
             * SVM's memory directly; we put them into the colo_cache first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously we copied all of this memory in the COLO preparing
             * stage while the VM was stopped, which was time-consuming.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled.  This slows the
             * migration down a little, but it clearly reduces the downtime
             * that backing up all of the SVM's memory in the COLO preparing
             * stage would cause.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclamation in this code becomes frequent, it will be
     * necessary to reduce the granularity of this critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; there is another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
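
/*
 * A rough sketch of the postcopy-recovery bitmap handshake implemented by
 * ram_dirty_bitmap_sync_all() above and ram_dirty_bitmap_reload() below:
 * the source sends one "recv bitmap" request per RAMBlock and then waits
 * on rp_sem once per block; the return-path thread parses each reply with
 * ram_dirty_bitmap_reload(), which posts rp_sem via
 * ram_dirty_bitmap_reload_notify().  Each reply carries a be64 size, the
 * little-endian bitmap itself (padded to a multiple of 8 bytes), and a
 * be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).  The received bitmap is
 * then complemented: pages the destination already has are clean, and
 * everything else becomes dirty again for the resumed migration.
 */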

/*
 * Read the received bitmap and revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused), so the
     * dirty bitmap won't change and we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised.  Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
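
/*
 * Note that the section version passed to register_savevm_live() above (4)
 * has to stay in sync with the "version_id != 4" check in ram_load(): an
 * incoming stream announcing any other version is rejected with -EINVAL.
 */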