1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "io/channel-null.h" 36 #include "xbzrle.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qmp/qerror.h" 49 #include "trace.h" 50 #include "exec/ram_addr.h" 51 #include "exec/target_page.h" 52 #include "qemu/rcu_queue.h" 53 #include "migration/colo.h" 54 #include "block.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 61 #include "hw/boards.h" /* for machine_dump_guest_core() */ 62 63 #if defined(__linux__) 64 #include "qemu/userfaultfd.h" 65 #endif /* defined(__linux__) */ 66 67 /***********************************************************/ 68 /* ram save/restore */ 69 70 /* 71 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 72 * worked for pages that were filled with the same char. We switched 73 * it to only search for the zero value. And to avoid confusion with 74 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
75 */ 76 /* 77 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 78 */ 79 #define RAM_SAVE_FLAG_FULL 0x01 80 #define RAM_SAVE_FLAG_ZERO 0x02 81 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 82 #define RAM_SAVE_FLAG_PAGE 0x08 83 #define RAM_SAVE_FLAG_EOS 0x10 84 #define RAM_SAVE_FLAG_CONTINUE 0x20 85 #define RAM_SAVE_FLAG_XBZRLE 0x40 86 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 87 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 88 /* We can't use any flag that is bigger than 0x200 */ 89 90 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, 91 uint8_t *, int) = xbzrle_encode_buffer; 92 #if defined(CONFIG_AVX512BW_OPT) 93 #include "qemu/cpuid.h" 94 static void __attribute__((constructor)) init_cpu_flag(void) 95 { 96 unsigned max = __get_cpuid_max(0, NULL); 97 int a, b, c, d; 98 if (max >= 1) { 99 __cpuid(1, a, b, c, d); 100 /* We must check that AVX is not just available, but usable. */ 101 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { 102 int bv; 103 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); 104 __cpuid_count(7, 0, a, b, c, d); 105 /* 0xe6: 106 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 107 * and ZMM16-ZMM31 state are enabled by OS) 108 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) 109 */ 110 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { 111 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; 112 } 113 } 114 } 115 } 116 #endif 117 118 XBZRLECacheStats xbzrle_counters; 119 120 /* used by the search for pages to send */ 121 struct PageSearchStatus { 122 /* The migration channel used for a specific host page */ 123 QEMUFile *pss_channel; 124 /* Last block from where we have sent data */ 125 RAMBlock *last_sent_block; 126 /* Current block being searched */ 127 RAMBlock *block; 128 /* Current page to search from */ 129 unsigned long page; 130 /* Set once we wrap around */ 131 bool complete_round; 132 /* Whether we're sending a host page */ 133 bool host_page_sending; 134 /* The start/end of current host page. Invalid if host_page_sending==false */ 135 unsigned long host_page_start; 136 unsigned long host_page_end; 137 }; 138 typedef struct PageSearchStatus PageSearchStatus; 139 140 /* struct contains XBZRLE cache and a static page 141 used by the compression */ 142 static struct { 143 /* buffer used for XBZRLE encoding */ 144 uint8_t *encoded_buf; 145 /* buffer for storing page content */ 146 uint8_t *current_buf; 147 /* Cache for XBZRLE, Protected by lock. */ 148 PageCache *cache; 149 QemuMutex lock; 150 /* it will store a page full of zeros */ 151 uint8_t *zero_target_page; 152 /* buffer used for XBZRLE decoding */ 153 uint8_t *decoded_buf; 154 } XBZRLE; 155 156 static void XBZRLE_cache_lock(void) 157 { 158 if (migrate_use_xbzrle()) { 159 qemu_mutex_lock(&XBZRLE.lock); 160 } 161 } 162 163 static void XBZRLE_cache_unlock(void) 164 { 165 if (migrate_use_xbzrle()) { 166 qemu_mutex_unlock(&XBZRLE.lock); 167 } 168 } 169 170 /** 171 * xbzrle_cache_resize: resize the xbzrle cache 172 * 173 * This function is called from migrate_params_apply in main 174 * thread, possibly while a migration is in progress. A running 175 * migration may be using the cache and might finish during this call, 176 * hence changes to the cache are protected by XBZRLE.lock(). 
177 * 178 * Returns 0 for success or -1 for error 179 * 180 * @new_size: new cache size 181 * @errp: set *errp if the check failed, with reason 182 */ 183 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 184 { 185 PageCache *new_cache; 186 int64_t ret = 0; 187 188 /* Check for truncation */ 189 if (new_size != (size_t)new_size) { 190 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 191 "exceeding address space"); 192 return -1; 193 } 194 195 if (new_size == migrate_xbzrle_cache_size()) { 196 /* nothing to do */ 197 return 0; 198 } 199 200 XBZRLE_cache_lock(); 201 202 if (XBZRLE.cache != NULL) { 203 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 204 if (!new_cache) { 205 ret = -1; 206 goto out; 207 } 208 209 cache_fini(XBZRLE.cache); 210 XBZRLE.cache = new_cache; 211 } 212 out: 213 XBZRLE_cache_unlock(); 214 return ret; 215 } 216 217 static bool postcopy_preempt_active(void) 218 { 219 return migrate_postcopy_preempt() && migration_in_postcopy(); 220 } 221 222 bool ramblock_is_ignored(RAMBlock *block) 223 { 224 return !qemu_ram_is_migratable(block) || 225 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 226 } 227 228 #undef RAMBLOCK_FOREACH 229 230 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 231 { 232 RAMBlock *block; 233 int ret = 0; 234 235 RCU_READ_LOCK_GUARD(); 236 237 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 238 ret = func(block, opaque); 239 if (ret) { 240 break; 241 } 242 } 243 return ret; 244 } 245 246 static void ramblock_recv_map_init(void) 247 { 248 RAMBlock *rb; 249 250 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 251 assert(!rb->receivedmap); 252 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 253 } 254 } 255 256 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 257 { 258 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 259 rb->receivedmap); 260 } 261 262 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 263 { 264 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 265 } 266 267 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 268 { 269 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 270 } 271 272 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 273 size_t nr) 274 { 275 bitmap_set_atomic(rb->receivedmap, 276 ramblock_recv_bitmap_offset(host_addr, rb), 277 nr); 278 } 279 280 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 281 282 /* 283 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 284 * 285 * Returns >0 if success with sent bytes, or <0 if error. 286 */ 287 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 288 const char *block_name) 289 { 290 RAMBlock *block = qemu_ram_block_by_name(block_name); 291 unsigned long *le_bitmap, nbits; 292 uint64_t size; 293 294 if (!block) { 295 error_report("%s: invalid block name: %s", __func__, block_name); 296 return -1; 297 } 298 299 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 300 301 /* 302 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 303 * machines we may need 4 more bytes for padding (see below 304 * comment). So extend it a bit before hand. 305 */ 306 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 307 308 /* 309 * Always use little endian when sending the bitmap. This is 310 * required that when source and destination VMs are not using the 311 * same endianness. (Note: big endian won't work.) 
312 */ 313 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 314 315 /* Size of the bitmap, in bytes */ 316 size = DIV_ROUND_UP(nbits, 8); 317 318 /* 319 * size is always aligned to 8 bytes for 64bit machines, but it 320 * may not be true for 32bit machines. We need this padding to 321 * make sure the migration can survive even between 32bit and 322 * 64bit machines. 323 */ 324 size = ROUND_UP(size, 8); 325 326 qemu_put_be64(file, size); 327 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 328 /* 329 * Mark as an end, in case the middle part is screwed up due to 330 * some "mysterious" reason. 331 */ 332 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 333 qemu_fflush(file); 334 335 g_free(le_bitmap); 336 337 if (qemu_file_get_error(file)) { 338 return qemu_file_get_error(file); 339 } 340 341 return size + sizeof(size); 342 } 343 344 /* 345 * An outstanding page request, on the source, having been received 346 * and queued 347 */ 348 struct RAMSrcPageRequest { 349 RAMBlock *rb; 350 hwaddr offset; 351 hwaddr len; 352 353 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 354 }; 355 356 /* State of RAM for migration */ 357 struct RAMState { 358 /* 359 * PageSearchStatus structures for the channels when send pages. 360 * Protected by the bitmap_mutex. 361 */ 362 PageSearchStatus pss[RAM_CHANNEL_MAX]; 363 /* UFFD file descriptor, used in 'write-tracking' migration */ 364 int uffdio_fd; 365 /* total ram size in bytes */ 366 uint64_t ram_bytes_total; 367 /* Last block that we have visited searching for dirty pages */ 368 RAMBlock *last_seen_block; 369 /* Last dirty target page we have sent */ 370 ram_addr_t last_page; 371 /* last ram version we have seen */ 372 uint32_t last_version; 373 /* How many times we have dirty too many pages */ 374 int dirty_rate_high_cnt; 375 /* these variables are used for bitmap sync */ 376 /* last time we did a full bitmap_sync */ 377 int64_t time_last_bitmap_sync; 378 /* bytes transferred at start_time */ 379 uint64_t bytes_xfer_prev; 380 /* number of dirty pages since start_time */ 381 uint64_t num_dirty_pages_period; 382 /* xbzrle misses since the beginning of the period */ 383 uint64_t xbzrle_cache_miss_prev; 384 /* Amount of xbzrle pages since the beginning of the period */ 385 uint64_t xbzrle_pages_prev; 386 /* Amount of xbzrle encoded bytes since the beginning of the period */ 387 uint64_t xbzrle_bytes_prev; 388 /* Start using XBZRLE (e.g., after the first round). 
*/ 389 bool xbzrle_enabled; 390 /* Are we on the last stage of migration */ 391 bool last_stage; 392 /* compression statistics since the beginning of the period */ 393 /* amount of count that no free thread to compress data */ 394 uint64_t compress_thread_busy_prev; 395 /* amount bytes after compression */ 396 uint64_t compressed_size_prev; 397 /* amount of compressed pages */ 398 uint64_t compress_pages_prev; 399 400 /* total handled target pages at the beginning of period */ 401 uint64_t target_page_count_prev; 402 /* total handled target pages since start */ 403 uint64_t target_page_count; 404 /* number of dirty bits in the bitmap */ 405 uint64_t migration_dirty_pages; 406 /* 407 * Protects: 408 * - dirty/clear bitmap 409 * - migration_dirty_pages 410 * - pss structures 411 */ 412 QemuMutex bitmap_mutex; 413 /* The RAMBlock used in the last src_page_requests */ 414 RAMBlock *last_req_rb; 415 /* Queue of outstanding page requests from the destination */ 416 QemuMutex src_page_req_mutex; 417 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 418 }; 419 typedef struct RAMState RAMState; 420 421 static RAMState *ram_state; 422 423 static NotifierWithReturnList precopy_notifier_list; 424 425 /* Whether postcopy has queued requests? */ 426 static bool postcopy_has_request(RAMState *rs) 427 { 428 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 429 } 430 431 void precopy_infrastructure_init(void) 432 { 433 notifier_with_return_list_init(&precopy_notifier_list); 434 } 435 436 void precopy_add_notifier(NotifierWithReturn *n) 437 { 438 notifier_with_return_list_add(&precopy_notifier_list, n); 439 } 440 441 void precopy_remove_notifier(NotifierWithReturn *n) 442 { 443 notifier_with_return_remove(n); 444 } 445 446 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 447 { 448 PrecopyNotifyData pnd; 449 pnd.reason = reason; 450 pnd.errp = errp; 451 452 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 453 } 454 455 uint64_t ram_bytes_remaining(void) 456 { 457 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 458 0; 459 } 460 461 /* 462 * NOTE: not all stats in ram_counters are used in reality. See comments 463 * for struct MigrationAtomicStats. The ultimate result of ram migration 464 * counters will be a merged version with both ram_counters and the atomic 465 * fields in ram_atomic_counters. 
466 */ 467 MigrationStats ram_counters; 468 MigrationAtomicStats ram_atomic_counters; 469 470 void ram_transferred_add(uint64_t bytes) 471 { 472 if (runstate_is_running()) { 473 ram_counters.precopy_bytes += bytes; 474 } else if (migration_in_postcopy()) { 475 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes); 476 } else { 477 ram_counters.downtime_bytes += bytes; 478 } 479 stat64_add(&ram_atomic_counters.transferred, bytes); 480 } 481 482 void dirty_sync_missed_zero_copy(void) 483 { 484 ram_counters.dirty_sync_missed_zero_copy++; 485 } 486 487 struct MigrationOps { 488 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 489 }; 490 typedef struct MigrationOps MigrationOps; 491 492 MigrationOps *migration_ops; 493 494 CompressionStats compression_counters; 495 496 struct CompressParam { 497 bool done; 498 bool quit; 499 bool zero_page; 500 QEMUFile *file; 501 QemuMutex mutex; 502 QemuCond cond; 503 RAMBlock *block; 504 ram_addr_t offset; 505 506 /* internally used fields */ 507 z_stream stream; 508 uint8_t *originbuf; 509 }; 510 typedef struct CompressParam CompressParam; 511 512 struct DecompressParam { 513 bool done; 514 bool quit; 515 QemuMutex mutex; 516 QemuCond cond; 517 void *des; 518 uint8_t *compbuf; 519 int len; 520 z_stream stream; 521 }; 522 typedef struct DecompressParam DecompressParam; 523 524 static CompressParam *comp_param; 525 static QemuThread *compress_threads; 526 /* comp_done_cond is used to wake up the migration thread when 527 * one of the compression threads has finished the compression. 528 * comp_done_lock is used to co-work with comp_done_cond. 529 */ 530 static QemuMutex comp_done_lock; 531 static QemuCond comp_done_cond; 532 533 static QEMUFile *decomp_file; 534 static DecompressParam *decomp_param; 535 static QemuThread *decompress_threads; 536 static QemuMutex decomp_done_lock; 537 static QemuCond decomp_done_cond; 538 539 static int ram_save_host_page_urgent(PageSearchStatus *pss); 540 541 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 542 ram_addr_t offset, uint8_t *source_buf); 543 544 /* NOTE: page is the PFN not real ram_addr_t. */ 545 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 546 { 547 pss->block = rb; 548 pss->page = page; 549 pss->complete_round = false; 550 } 551 552 /* 553 * Check whether two PSSs are actively sending the same page. Return true 554 * if it is, false otherwise. 
555 */ 556 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 557 { 558 return pss1->host_page_sending && pss2->host_page_sending && 559 (pss1->host_page_start == pss2->host_page_start); 560 } 561 562 static void *do_data_compress(void *opaque) 563 { 564 CompressParam *param = opaque; 565 RAMBlock *block; 566 ram_addr_t offset; 567 bool zero_page; 568 569 qemu_mutex_lock(¶m->mutex); 570 while (!param->quit) { 571 if (param->block) { 572 block = param->block; 573 offset = param->offset; 574 param->block = NULL; 575 qemu_mutex_unlock(¶m->mutex); 576 577 zero_page = do_compress_ram_page(param->file, ¶m->stream, 578 block, offset, param->originbuf); 579 580 qemu_mutex_lock(&comp_done_lock); 581 param->done = true; 582 param->zero_page = zero_page; 583 qemu_cond_signal(&comp_done_cond); 584 qemu_mutex_unlock(&comp_done_lock); 585 586 qemu_mutex_lock(¶m->mutex); 587 } else { 588 qemu_cond_wait(¶m->cond, ¶m->mutex); 589 } 590 } 591 qemu_mutex_unlock(¶m->mutex); 592 593 return NULL; 594 } 595 596 static void compress_threads_save_cleanup(void) 597 { 598 int i, thread_count; 599 600 if (!migrate_use_compression() || !comp_param) { 601 return; 602 } 603 604 thread_count = migrate_compress_threads(); 605 for (i = 0; i < thread_count; i++) { 606 /* 607 * we use it as a indicator which shows if the thread is 608 * properly init'd or not 609 */ 610 if (!comp_param[i].file) { 611 break; 612 } 613 614 qemu_mutex_lock(&comp_param[i].mutex); 615 comp_param[i].quit = true; 616 qemu_cond_signal(&comp_param[i].cond); 617 qemu_mutex_unlock(&comp_param[i].mutex); 618 619 qemu_thread_join(compress_threads + i); 620 qemu_mutex_destroy(&comp_param[i].mutex); 621 qemu_cond_destroy(&comp_param[i].cond); 622 deflateEnd(&comp_param[i].stream); 623 g_free(comp_param[i].originbuf); 624 qemu_fclose(comp_param[i].file); 625 comp_param[i].file = NULL; 626 } 627 qemu_mutex_destroy(&comp_done_lock); 628 qemu_cond_destroy(&comp_done_cond); 629 g_free(compress_threads); 630 g_free(comp_param); 631 compress_threads = NULL; 632 comp_param = NULL; 633 } 634 635 static int compress_threads_save_setup(void) 636 { 637 int i, thread_count; 638 639 if (!migrate_use_compression()) { 640 return 0; 641 } 642 thread_count = migrate_compress_threads(); 643 compress_threads = g_new0(QemuThread, thread_count); 644 comp_param = g_new0(CompressParam, thread_count); 645 qemu_cond_init(&comp_done_cond); 646 qemu_mutex_init(&comp_done_lock); 647 for (i = 0; i < thread_count; i++) { 648 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 649 if (!comp_param[i].originbuf) { 650 goto exit; 651 } 652 653 if (deflateInit(&comp_param[i].stream, 654 migrate_compress_level()) != Z_OK) { 655 g_free(comp_param[i].originbuf); 656 goto exit; 657 } 658 659 /* comp_param[i].file is just used as a dummy buffer to save data, 660 * set its ops to empty. 
661 */ 662 comp_param[i].file = qemu_file_new_output( 663 QIO_CHANNEL(qio_channel_null_new())); 664 comp_param[i].done = true; 665 comp_param[i].quit = false; 666 qemu_mutex_init(&comp_param[i].mutex); 667 qemu_cond_init(&comp_param[i].cond); 668 qemu_thread_create(compress_threads + i, "compress", 669 do_data_compress, comp_param + i, 670 QEMU_THREAD_JOINABLE); 671 } 672 return 0; 673 674 exit: 675 compress_threads_save_cleanup(); 676 return -1; 677 } 678 679 /** 680 * save_page_header: write page header to wire 681 * 682 * If this is the 1st block, it also writes the block identification 683 * 684 * Returns the number of bytes written 685 * 686 * @pss: current PSS channel status 687 * @block: block that contains the page we want to send 688 * @offset: offset inside the block for the page 689 * in the lower bits, it contains flags 690 */ 691 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 692 RAMBlock *block, ram_addr_t offset) 693 { 694 size_t size, len; 695 bool same_block = (block == pss->last_sent_block); 696 697 if (same_block) { 698 offset |= RAM_SAVE_FLAG_CONTINUE; 699 } 700 qemu_put_be64(f, offset); 701 size = 8; 702 703 if (!same_block) { 704 len = strlen(block->idstr); 705 qemu_put_byte(f, len); 706 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 707 size += 1 + len; 708 pss->last_sent_block = block; 709 } 710 return size; 711 } 712 713 /** 714 * mig_throttle_guest_down: throttle down the guest 715 * 716 * Reduce amount of guest cpu execution to hopefully slow down memory 717 * writes. If guest dirty memory rate is reduced below the rate at 718 * which we can transfer pages to the destination then we should be 719 * able to complete migration. Some workloads dirty memory way too 720 * fast and will not effectively converge, even with auto-converge. 721 */ 722 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 723 uint64_t bytes_dirty_threshold) 724 { 725 MigrationState *s = migrate_get_current(); 726 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 727 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 728 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 729 int pct_max = s->parameters.max_cpu_throttle; 730 731 uint64_t throttle_now = cpu_throttle_get_percentage(); 732 uint64_t cpu_now, cpu_ideal, throttle_inc; 733 734 /* We have not started throttling yet. Let's start it. */ 735 if (!cpu_throttle_active()) { 736 cpu_throttle_set(pct_initial); 737 } else { 738 /* Throttling already on, just increase the rate */ 739 if (!pct_tailslow) { 740 throttle_inc = pct_increment; 741 } else { 742 /* Compute the ideal CPU percentage used by Guest, which may 743 * make the dirty rate match the dirty rate threshold. */ 744 cpu_now = 100 - throttle_now; 745 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 746 bytes_dirty_period); 747 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 748 } 749 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 750 } 751 } 752 753 void mig_throttle_counter_reset(void) 754 { 755 RAMState *rs = ram_state; 756 757 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 758 rs->num_dirty_pages_period = 0; 759 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); 760 } 761 762 /** 763 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 764 * 765 * @rs: current RAM state 766 * @current_addr: address for the zero page 767 * 768 * Update the xbzrle cache to reflect a page that's been sent as all 0. 
769 * The important thing is that a stale (not-yet-0'd) page be replaced 770 * by the new data. 771 * As a bonus, if the page wasn't in the cache it gets added so that 772 * when a small write is made into the 0'd page it gets XBZRLE sent. 773 */ 774 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 775 { 776 /* We don't care if this fails to allocate a new cache page 777 * as long as it updated an old one */ 778 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 779 ram_counters.dirty_sync_count); 780 } 781 782 #define ENCODING_FLAG_XBZRLE 0x1 783 784 /** 785 * save_xbzrle_page: compress and send current page 786 * 787 * Returns: 1 means that we wrote the page 788 * 0 means that page is identical to the one already sent 789 * -1 means that xbzrle would be longer than normal 790 * 791 * @rs: current RAM state 792 * @pss: current PSS channel 793 * @current_data: pointer to the address of the page contents 794 * @current_addr: addr of the page 795 * @block: block that contains the page we want to send 796 * @offset: offset inside the block for the page 797 */ 798 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 799 uint8_t **current_data, ram_addr_t current_addr, 800 RAMBlock *block, ram_addr_t offset) 801 { 802 int encoded_len = 0, bytes_xbzrle; 803 uint8_t *prev_cached_page; 804 QEMUFile *file = pss->pss_channel; 805 806 if (!cache_is_cached(XBZRLE.cache, current_addr, 807 ram_counters.dirty_sync_count)) { 808 xbzrle_counters.cache_miss++; 809 if (!rs->last_stage) { 810 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 811 ram_counters.dirty_sync_count) == -1) { 812 return -1; 813 } else { 814 /* update *current_data when the page has been 815 inserted into cache */ 816 *current_data = get_cached_data(XBZRLE.cache, current_addr); 817 } 818 } 819 return -1; 820 } 821 822 /* 823 * Reaching here means the page has hit the xbzrle cache, no matter what 824 * encoding result it is (normal encoding, overflow or skipping the page), 825 * count the page as encoded. This is used to calculate the encoding rate. 826 * 827 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 828 * 2nd page turns out to be skipped (i.e. no new bytes written to the 829 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 830 * skipped page included. In this way, the encoding rate can tell if the 831 * guest page is good for xbzrle encoding. 832 */ 833 xbzrle_counters.pages++; 834 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 835 836 /* save current buffer into memory */ 837 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 838 839 /* XBZRLE encoding (if there is no overflow) */ 840 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, 841 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 842 TARGET_PAGE_SIZE); 843 844 /* 845 * Update the cache contents, so that it corresponds to the data 846 * sent, in all cases except where we skip the page. 847 */ 848 if (!rs->last_stage && encoded_len != 0) { 849 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 850 /* 851 * In the case where we couldn't compress, ensure that the caller 852 * sends the data from the cache, since the guest might have 853 * changed the RAM since we copied it. 
854 */ 855 *current_data = prev_cached_page; 856 } 857 858 if (encoded_len == 0) { 859 trace_save_xbzrle_page_skipping(); 860 return 0; 861 } else if (encoded_len == -1) { 862 trace_save_xbzrle_page_overflow(); 863 xbzrle_counters.overflow++; 864 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 865 return -1; 866 } 867 868 /* Send XBZRLE based compressed page */ 869 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 870 offset | RAM_SAVE_FLAG_XBZRLE); 871 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 872 qemu_put_be16(file, encoded_len); 873 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 874 bytes_xbzrle += encoded_len + 1 + 2; 875 /* 876 * Like compressed_size (please see update_compress_thread_counts), 877 * the xbzrle encoded bytes don't count the 8 byte header with 878 * RAM_SAVE_FLAG_CONTINUE. 879 */ 880 xbzrle_counters.bytes += bytes_xbzrle - 8; 881 ram_transferred_add(bytes_xbzrle); 882 883 return 1; 884 } 885 886 /** 887 * pss_find_next_dirty: find the next dirty page of current ramblock 888 * 889 * This function updates pss->page to point to the next dirty page index 890 * within the ramblock to migrate, or the end of ramblock when nothing 891 * found. Note that when pss->host_page_sending==true it means we're 892 * during sending a host page, so we won't look for dirty page that is 893 * outside the host page boundary. 894 * 895 * @pss: the current page search status 896 */ 897 static void pss_find_next_dirty(PageSearchStatus *pss) 898 { 899 RAMBlock *rb = pss->block; 900 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 901 unsigned long *bitmap = rb->bmap; 902 903 if (ramblock_is_ignored(rb)) { 904 /* Points directly to the end, so we know no dirty page */ 905 pss->page = size; 906 return; 907 } 908 909 /* 910 * If during sending a host page, only look for dirty pages within the 911 * current host page being send. 912 */ 913 if (pss->host_page_sending) { 914 assert(pss->host_page_end); 915 size = MIN(size, pss->host_page_end); 916 } 917 918 pss->page = find_next_bit(bitmap, size, pss->page); 919 } 920 921 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 922 unsigned long page) 923 { 924 uint8_t shift; 925 hwaddr size, start; 926 927 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 928 return; 929 } 930 931 shift = rb->clear_bmap_shift; 932 /* 933 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 934 * can make things easier sometimes since then start address 935 * of the small chunk will always be 64 pages aligned so the 936 * bitmap will always be aligned to unsigned long. We should 937 * even be able to remove this restriction but I'm simply 938 * keeping it. 939 */ 940 assert(shift >= 6); 941 942 size = 1ULL << (TARGET_PAGE_BITS + shift); 943 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 944 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 945 memory_region_clear_dirty_bitmap(rb->mr, start, size); 946 } 947 948 static void 949 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 950 unsigned long start, 951 unsigned long npages) 952 { 953 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 954 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 955 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 956 957 /* 958 * Clear pages from start to start + npages - 1, so the end boundary is 959 * exclusive. 
960 */ 961 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 962 migration_clear_memory_region_dirty_bitmap(rb, i); 963 } 964 } 965 966 /* 967 * colo_bitmap_find_diry:find contiguous dirty pages from start 968 * 969 * Returns the page offset within memory region of the start of the contiguout 970 * dirty page 971 * 972 * @rs: current RAM state 973 * @rb: RAMBlock where to search for dirty pages 974 * @start: page where we start the search 975 * @num: the number of contiguous dirty pages 976 */ 977 static inline 978 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 979 unsigned long start, unsigned long *num) 980 { 981 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 982 unsigned long *bitmap = rb->bmap; 983 unsigned long first, next; 984 985 *num = 0; 986 987 if (ramblock_is_ignored(rb)) { 988 return size; 989 } 990 991 first = find_next_bit(bitmap, size, start); 992 if (first >= size) { 993 return first; 994 } 995 next = find_next_zero_bit(bitmap, size, first + 1); 996 assert(next >= first); 997 *num = next - first; 998 return first; 999 } 1000 1001 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 1002 RAMBlock *rb, 1003 unsigned long page) 1004 { 1005 bool ret; 1006 1007 /* 1008 * Clear dirty bitmap if needed. This _must_ be called before we 1009 * send any of the page in the chunk because we need to make sure 1010 * we can capture further page content changes when we sync dirty 1011 * log the next time. So as long as we are going to send any of 1012 * the page in the chunk we clear the remote dirty bitmap for all. 1013 * Clearing it earlier won't be a problem, but too late will. 1014 */ 1015 migration_clear_memory_region_dirty_bitmap(rb, page); 1016 1017 ret = test_and_clear_bit(page, rb->bmap); 1018 if (ret) { 1019 rs->migration_dirty_pages--; 1020 } 1021 1022 return ret; 1023 } 1024 1025 static void dirty_bitmap_clear_section(MemoryRegionSection *section, 1026 void *opaque) 1027 { 1028 const hwaddr offset = section->offset_within_region; 1029 const hwaddr size = int128_get64(section->size); 1030 const unsigned long start = offset >> TARGET_PAGE_BITS; 1031 const unsigned long npages = size >> TARGET_PAGE_BITS; 1032 RAMBlock *rb = section->mr->ram_block; 1033 uint64_t *cleared_bits = opaque; 1034 1035 /* 1036 * We don't grab ram_state->bitmap_mutex because we expect to run 1037 * only when starting migration or during postcopy recovery where 1038 * we don't have concurrent access. 1039 */ 1040 if (!migration_in_postcopy() && !migrate_background_snapshot()) { 1041 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages); 1042 } 1043 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); 1044 bitmap_clear(rb->bmap, start, npages); 1045 } 1046 1047 /* 1048 * Exclude all dirty pages from migration that fall into a discarded range as 1049 * managed by a RamDiscardManager responsible for the mapped memory region of 1050 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps. 1051 * 1052 * Discarded pages ("logically unplugged") have undefined content and must 1053 * not get migrated, because even reading these pages for migration might 1054 * result in undesired behavior. 1055 * 1056 * Returns the number of cleared bits in the RAMBlock dirty bitmap. 1057 * 1058 * Note: The result is only stable while migrating (precopy/postcopy). 
1059 */ 1060 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 1061 { 1062 uint64_t cleared_bits = 0; 1063 1064 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 1065 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1066 MemoryRegionSection section = { 1067 .mr = rb->mr, 1068 .offset_within_region = 0, 1069 .size = int128_make64(qemu_ram_get_used_length(rb)), 1070 }; 1071 1072 ram_discard_manager_replay_discarded(rdm, §ion, 1073 dirty_bitmap_clear_section, 1074 &cleared_bits); 1075 } 1076 return cleared_bits; 1077 } 1078 1079 /* 1080 * Check if a host-page aligned page falls into a discarded range as managed by 1081 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 1082 * 1083 * Note: The result is only stable while migrating (precopy/postcopy). 1084 */ 1085 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 1086 { 1087 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1088 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1089 MemoryRegionSection section = { 1090 .mr = rb->mr, 1091 .offset_within_region = start, 1092 .size = int128_make64(qemu_ram_pagesize(rb)), 1093 }; 1094 1095 return !ram_discard_manager_is_populated(rdm, §ion); 1096 } 1097 return false; 1098 } 1099 1100 /* Called with RCU critical section */ 1101 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 1102 { 1103 uint64_t new_dirty_pages = 1104 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 1105 1106 rs->migration_dirty_pages += new_dirty_pages; 1107 rs->num_dirty_pages_period += new_dirty_pages; 1108 } 1109 1110 /** 1111 * ram_pagesize_summary: calculate all the pagesizes of a VM 1112 * 1113 * Returns a summary bitmap of the page sizes of all RAMBlocks 1114 * 1115 * For VMs with just normal pages this is equivalent to the host page 1116 * size. If it's got some huge pages then it's the OR of all the 1117 * different page sizes. 
1118 */ 1119 uint64_t ram_pagesize_summary(void) 1120 { 1121 RAMBlock *block; 1122 uint64_t summary = 0; 1123 1124 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1125 summary |= block->page_size; 1126 } 1127 1128 return summary; 1129 } 1130 1131 uint64_t ram_get_total_transferred_pages(void) 1132 { 1133 return stat64_get(&ram_atomic_counters.normal) + 1134 stat64_get(&ram_atomic_counters.duplicate) + 1135 compression_counters.pages + xbzrle_counters.pages; 1136 } 1137 1138 static void migration_update_rates(RAMState *rs, int64_t end_time) 1139 { 1140 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1141 double compressed_size; 1142 1143 /* calculate period counters */ 1144 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1145 / (end_time - rs->time_last_bitmap_sync); 1146 1147 if (!page_count) { 1148 return; 1149 } 1150 1151 if (migrate_use_xbzrle()) { 1152 double encoded_size, unencoded_size; 1153 1154 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1155 rs->xbzrle_cache_miss_prev) / page_count; 1156 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1157 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1158 TARGET_PAGE_SIZE; 1159 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1160 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1161 xbzrle_counters.encoding_rate = 0; 1162 } else { 1163 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1164 } 1165 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1166 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1167 } 1168 1169 if (migrate_use_compression()) { 1170 compression_counters.busy_rate = (double)(compression_counters.busy - 1171 rs->compress_thread_busy_prev) / page_count; 1172 rs->compress_thread_busy_prev = compression_counters.busy; 1173 1174 compressed_size = compression_counters.compressed_size - 1175 rs->compressed_size_prev; 1176 if (compressed_size) { 1177 double uncompressed_size = (compression_counters.pages - 1178 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1179 1180 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1181 compression_counters.compression_rate = 1182 uncompressed_size / compressed_size; 1183 1184 rs->compress_pages_prev = compression_counters.pages; 1185 rs->compressed_size_prev = compression_counters.compressed_size; 1186 } 1187 } 1188 } 1189 1190 static void migration_trigger_throttle(RAMState *rs) 1191 { 1192 MigrationState *s = migrate_get_current(); 1193 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1194 uint64_t bytes_xfer_period = 1195 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev; 1196 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1197 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1198 1199 /* During block migration the auto-converge logic incorrectly detects 1200 * that ram migration makes no progress. Avoid this by disabling the 1201 * throttling logic during the bulk phase of block migration. */ 1202 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1203 /* The following detection logic can be refined later. For now: 1204 Check to see if the ratio between dirtied bytes and the approx. 1205 amount of bytes that just got transferred since the last time 1206 we were in this routine reaches the threshold. If that happens 1207 twice, start or increase throttling. 
*/ 1208 1209 if ((bytes_dirty_period > bytes_dirty_threshold) && 1210 (++rs->dirty_rate_high_cnt >= 2)) { 1211 trace_migration_throttle(); 1212 rs->dirty_rate_high_cnt = 0; 1213 mig_throttle_guest_down(bytes_dirty_period, 1214 bytes_dirty_threshold); 1215 } 1216 } 1217 } 1218 1219 static void migration_bitmap_sync(RAMState *rs) 1220 { 1221 RAMBlock *block; 1222 int64_t end_time; 1223 1224 ram_counters.dirty_sync_count++; 1225 1226 if (!rs->time_last_bitmap_sync) { 1227 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1228 } 1229 1230 trace_migration_bitmap_sync_start(); 1231 memory_global_dirty_log_sync(); 1232 1233 qemu_mutex_lock(&rs->bitmap_mutex); 1234 WITH_RCU_READ_LOCK_GUARD() { 1235 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1236 ramblock_sync_dirty_bitmap(rs, block); 1237 } 1238 ram_counters.remaining = ram_bytes_remaining(); 1239 } 1240 qemu_mutex_unlock(&rs->bitmap_mutex); 1241 1242 memory_global_after_dirty_log_sync(); 1243 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1244 1245 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1246 1247 /* more than 1 second = 1000 millisecons */ 1248 if (end_time > rs->time_last_bitmap_sync + 1000) { 1249 migration_trigger_throttle(rs); 1250 1251 migration_update_rates(rs, end_time); 1252 1253 rs->target_page_count_prev = rs->target_page_count; 1254 1255 /* reset period counters */ 1256 rs->time_last_bitmap_sync = end_time; 1257 rs->num_dirty_pages_period = 0; 1258 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); 1259 } 1260 if (migrate_use_events()) { 1261 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1262 } 1263 } 1264 1265 static void migration_bitmap_sync_precopy(RAMState *rs) 1266 { 1267 Error *local_err = NULL; 1268 1269 /* 1270 * The current notifier usage is just an optimization to migration, so we 1271 * don't stop the normal migration process in the error case. 1272 */ 1273 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1274 error_report_err(local_err); 1275 local_err = NULL; 1276 } 1277 1278 migration_bitmap_sync(rs); 1279 1280 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1281 error_report_err(local_err); 1282 } 1283 } 1284 1285 void ram_release_page(const char *rbname, uint64_t offset) 1286 { 1287 if (!migrate_release_ram() || !migration_in_postcopy()) { 1288 return; 1289 } 1290 1291 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1292 } 1293 1294 /** 1295 * save_zero_page_to_file: send the zero page to the file 1296 * 1297 * Returns the size of data written to the file, 0 means the page is not 1298 * a zero page 1299 * 1300 * @pss: current PSS channel 1301 * @block: block that contains the page we want to send 1302 * @offset: offset inside the block for the page 1303 */ 1304 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, 1305 RAMBlock *block, ram_addr_t offset) 1306 { 1307 uint8_t *p = block->host + offset; 1308 int len = 0; 1309 1310 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1311 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1312 qemu_put_byte(file, 0); 1313 len += 1; 1314 ram_release_page(block->idstr, offset); 1315 } 1316 return len; 1317 } 1318 1319 /** 1320 * save_zero_page: send the zero page to the stream 1321 * 1322 * Returns the number of pages written. 
1323 * 1324 * @pss: current PSS channel 1325 * @block: block that contains the page we want to send 1326 * @offset: offset inside the block for the page 1327 */ 1328 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, 1329 ram_addr_t offset) 1330 { 1331 int len = save_zero_page_to_file(pss, f, block, offset); 1332 1333 if (len) { 1334 stat64_add(&ram_atomic_counters.duplicate, 1); 1335 ram_transferred_add(len); 1336 return 1; 1337 } 1338 return -1; 1339 } 1340 1341 /* 1342 * @pages: the number of pages written by the control path, 1343 * < 0 - error 1344 * > 0 - number of pages written 1345 * 1346 * Return true if the pages has been saved, otherwise false is returned. 1347 */ 1348 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, 1349 ram_addr_t offset, int *pages) 1350 { 1351 uint64_t bytes_xmit = 0; 1352 int ret; 1353 1354 *pages = -1; 1355 ret = ram_control_save_page(pss->pss_channel, block->offset, offset, 1356 TARGET_PAGE_SIZE, &bytes_xmit); 1357 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1358 return false; 1359 } 1360 1361 if (bytes_xmit) { 1362 ram_transferred_add(bytes_xmit); 1363 *pages = 1; 1364 } 1365 1366 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1367 return true; 1368 } 1369 1370 if (bytes_xmit > 0) { 1371 stat64_add(&ram_atomic_counters.normal, 1); 1372 } else if (bytes_xmit == 0) { 1373 stat64_add(&ram_atomic_counters.duplicate, 1); 1374 } 1375 1376 return true; 1377 } 1378 1379 /* 1380 * directly send the page to the stream 1381 * 1382 * Returns the number of pages written. 1383 * 1384 * @pss: current PSS channel 1385 * @block: block that contains the page we want to send 1386 * @offset: offset inside the block for the page 1387 * @buf: the page to be sent 1388 * @async: send to page asyncly 1389 */ 1390 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, 1391 ram_addr_t offset, uint8_t *buf, bool async) 1392 { 1393 QEMUFile *file = pss->pss_channel; 1394 1395 ram_transferred_add(save_page_header(pss, pss->pss_channel, block, 1396 offset | RAM_SAVE_FLAG_PAGE)); 1397 if (async) { 1398 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, 1399 migrate_release_ram() && 1400 migration_in_postcopy()); 1401 } else { 1402 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); 1403 } 1404 ram_transferred_add(TARGET_PAGE_SIZE); 1405 stat64_add(&ram_atomic_counters.normal, 1); 1406 return 1; 1407 } 1408 1409 /** 1410 * ram_save_page: send the given page to the stream 1411 * 1412 * Returns the number of pages written. 1413 * < 0 - error 1414 * >=0 - Number of pages written - this might legally be 0 1415 * if xbzrle noticed the page was the same. 
1416 * 1417 * @rs: current RAM state 1418 * @block: block that contains the page we want to send 1419 * @offset: offset inside the block for the page 1420 */ 1421 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1422 { 1423 int pages = -1; 1424 uint8_t *p; 1425 bool send_async = true; 1426 RAMBlock *block = pss->block; 1427 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1428 ram_addr_t current_addr = block->offset + offset; 1429 1430 p = block->host + offset; 1431 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1432 1433 XBZRLE_cache_lock(); 1434 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1435 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1436 block, offset); 1437 if (!rs->last_stage) { 1438 /* Can't send this cached data async, since the cache page 1439 * might get updated before it gets to the wire 1440 */ 1441 send_async = false; 1442 } 1443 } 1444 1445 /* XBZRLE overflow or normal page */ 1446 if (pages == -1) { 1447 pages = save_normal_page(pss, block, offset, p, send_async); 1448 } 1449 1450 XBZRLE_cache_unlock(); 1451 1452 return pages; 1453 } 1454 1455 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, 1456 ram_addr_t offset) 1457 { 1458 if (multifd_queue_page(file, block, offset) < 0) { 1459 return -1; 1460 } 1461 stat64_add(&ram_atomic_counters.normal, 1); 1462 1463 return 1; 1464 } 1465 1466 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1467 ram_addr_t offset, uint8_t *source_buf) 1468 { 1469 RAMState *rs = ram_state; 1470 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 1471 uint8_t *p = block->host + offset; 1472 int ret; 1473 1474 if (save_zero_page_to_file(pss, f, block, offset)) { 1475 return true; 1476 } 1477 1478 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1479 1480 /* 1481 * copy it to a internal buffer to avoid it being modified by VM 1482 * so that we can catch up the error during compression and 1483 * decompression 1484 */ 1485 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1486 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1487 if (ret < 0) { 1488 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1489 error_report("compressed data failed!"); 1490 } 1491 return false; 1492 } 1493 1494 static void 1495 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1496 { 1497 ram_transferred_add(bytes_xmit); 1498 1499 if (param->zero_page) { 1500 stat64_add(&ram_atomic_counters.duplicate, 1); 1501 return; 1502 } 1503 1504 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. 
*/ 1505 compression_counters.compressed_size += bytes_xmit - 8; 1506 compression_counters.pages++; 1507 } 1508 1509 static bool save_page_use_compression(RAMState *rs); 1510 1511 static void flush_compressed_data(RAMState *rs) 1512 { 1513 MigrationState *ms = migrate_get_current(); 1514 int idx, len, thread_count; 1515 1516 if (!save_page_use_compression(rs)) { 1517 return; 1518 } 1519 thread_count = migrate_compress_threads(); 1520 1521 qemu_mutex_lock(&comp_done_lock); 1522 for (idx = 0; idx < thread_count; idx++) { 1523 while (!comp_param[idx].done) { 1524 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1525 } 1526 } 1527 qemu_mutex_unlock(&comp_done_lock); 1528 1529 for (idx = 0; idx < thread_count; idx++) { 1530 qemu_mutex_lock(&comp_param[idx].mutex); 1531 if (!comp_param[idx].quit) { 1532 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); 1533 /* 1534 * it's safe to fetch zero_page without holding comp_done_lock 1535 * as there is no further request submitted to the thread, 1536 * i.e, the thread should be waiting for a request at this point. 1537 */ 1538 update_compress_thread_counts(&comp_param[idx], len); 1539 } 1540 qemu_mutex_unlock(&comp_param[idx].mutex); 1541 } 1542 } 1543 1544 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1545 ram_addr_t offset) 1546 { 1547 param->block = block; 1548 param->offset = offset; 1549 } 1550 1551 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) 1552 { 1553 int idx, thread_count, bytes_xmit = -1, pages = -1; 1554 bool wait = migrate_compress_wait_thread(); 1555 MigrationState *ms = migrate_get_current(); 1556 1557 thread_count = migrate_compress_threads(); 1558 qemu_mutex_lock(&comp_done_lock); 1559 retry: 1560 for (idx = 0; idx < thread_count; idx++) { 1561 if (comp_param[idx].done) { 1562 comp_param[idx].done = false; 1563 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, 1564 comp_param[idx].file); 1565 qemu_mutex_lock(&comp_param[idx].mutex); 1566 set_compress_params(&comp_param[idx], block, offset); 1567 qemu_cond_signal(&comp_param[idx].cond); 1568 qemu_mutex_unlock(&comp_param[idx].mutex); 1569 pages = 1; 1570 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1571 break; 1572 } 1573 } 1574 1575 /* 1576 * wait for the free thread if the user specifies 'compress-wait-thread', 1577 * otherwise we will post the page out in the main thread as normal page. 1578 */ 1579 if (pages < 0 && wait) { 1580 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1581 goto retry; 1582 } 1583 qemu_mutex_unlock(&comp_done_lock); 1584 1585 return pages; 1586 } 1587 1588 #define PAGE_ALL_CLEAN 0 1589 #define PAGE_TRY_AGAIN 1 1590 #define PAGE_DIRTY_FOUND 2 1591 /** 1592 * find_dirty_block: find the next dirty page and update any state 1593 * associated with the search process. 1594 * 1595 * Returns: 1596 * PAGE_ALL_CLEAN: no dirty page found, give up 1597 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1598 * PAGE_DIRTY_FOUND: dirty page found 1599 * 1600 * @rs: current RAM state 1601 * @pss: data about the state of the current dirty page scan 1602 * @again: set to false if the search has scanned the whole of RAM 1603 */ 1604 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1605 { 1606 /* Update pss->page for the next dirty bit in ramblock */ 1607 pss_find_next_dirty(pss); 1608 1609 if (pss->complete_round && pss->block == rs->last_seen_block && 1610 pss->page >= rs->last_page) { 1611 /* 1612 * We've been once around the RAM and haven't found anything. 
1613 * Give up. 1614 */ 1615 return PAGE_ALL_CLEAN; 1616 } 1617 if (!offset_in_ramblock(pss->block, 1618 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1619 /* Didn't find anything in this RAM Block */ 1620 pss->page = 0; 1621 pss->block = QLIST_NEXT_RCU(pss->block, next); 1622 if (!pss->block) { 1623 /* 1624 * If memory migration starts over, we will meet a dirtied page 1625 * which may still exists in compression threads's ring, so we 1626 * should flush the compressed data to make sure the new page 1627 * is not overwritten by the old one in the destination. 1628 * 1629 * Also If xbzrle is on, stop using the data compression at this 1630 * point. In theory, xbzrle can do better than compression. 1631 */ 1632 flush_compressed_data(rs); 1633 1634 /* Hit the end of the list */ 1635 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1636 /* Flag that we've looped */ 1637 pss->complete_round = true; 1638 /* After the first round, enable XBZRLE. */ 1639 if (migrate_use_xbzrle()) { 1640 rs->xbzrle_enabled = true; 1641 } 1642 } 1643 /* Didn't find anything this time, but try again on the new block */ 1644 return PAGE_TRY_AGAIN; 1645 } else { 1646 /* We've found something */ 1647 return PAGE_DIRTY_FOUND; 1648 } 1649 } 1650 1651 /** 1652 * unqueue_page: gets a page of the queue 1653 * 1654 * Helper for 'get_queued_page' - gets a page off the queue 1655 * 1656 * Returns the block of the page (or NULL if none available) 1657 * 1658 * @rs: current RAM state 1659 * @offset: used to return the offset within the RAMBlock 1660 */ 1661 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1662 { 1663 struct RAMSrcPageRequest *entry; 1664 RAMBlock *block = NULL; 1665 1666 if (!postcopy_has_request(rs)) { 1667 return NULL; 1668 } 1669 1670 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1671 1672 /* 1673 * This should _never_ change even after we take the lock, because no one 1674 * should be taking anything off the request list other than us. 
1675 */ 1676 assert(postcopy_has_request(rs)); 1677 1678 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1679 block = entry->rb; 1680 *offset = entry->offset; 1681 1682 if (entry->len > TARGET_PAGE_SIZE) { 1683 entry->len -= TARGET_PAGE_SIZE; 1684 entry->offset += TARGET_PAGE_SIZE; 1685 } else { 1686 memory_region_unref(block->mr); 1687 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1688 g_free(entry); 1689 migration_consume_urgent_request(); 1690 } 1691 1692 return block; 1693 } 1694 1695 #if defined(__linux__) 1696 /** 1697 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1698 * is found, return RAM block pointer and page offset 1699 * 1700 * Returns pointer to the RAMBlock containing faulting page, 1701 * NULL if no write faults are pending 1702 * 1703 * @rs: current RAM state 1704 * @offset: page offset from the beginning of the block 1705 */ 1706 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1707 { 1708 struct uffd_msg uffd_msg; 1709 void *page_address; 1710 RAMBlock *block; 1711 int res; 1712 1713 if (!migrate_background_snapshot()) { 1714 return NULL; 1715 } 1716 1717 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1718 if (res <= 0) { 1719 return NULL; 1720 } 1721 1722 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1723 block = qemu_ram_block_from_host(page_address, false, offset); 1724 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1725 return block; 1726 } 1727 1728 /** 1729 * ram_save_release_protection: release UFFD write protection after 1730 * a range of pages has been saved 1731 * 1732 * @rs: current RAM state 1733 * @pss: page-search-status structure 1734 * @start_page: index of the first page in the range relative to pss->block 1735 * 1736 * Returns 0 on success, negative value in case of an error 1737 */ 1738 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1739 unsigned long start_page) 1740 { 1741 int res = 0; 1742 1743 /* Check if page is from UFFD-managed region. */ 1744 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1745 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1746 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1747 1748 /* Flush async buffers before un-protect. */ 1749 qemu_fflush(pss->pss_channel); 1750 /* Un-protect memory range. 
*/ 1751 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1752 false, false); 1753 } 1754 1755 return res; 1756 } 1757 1758 /* ram_write_tracking_available: check if kernel supports required UFFD features 1759 * 1760 * Returns true if supports, false otherwise 1761 */ 1762 bool ram_write_tracking_available(void) 1763 { 1764 uint64_t uffd_features; 1765 int res; 1766 1767 res = uffd_query_features(&uffd_features); 1768 return (res == 0 && 1769 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1770 } 1771 1772 /* ram_write_tracking_compatible: check if guest configuration is 1773 * compatible with 'write-tracking' 1774 * 1775 * Returns true if compatible, false otherwise 1776 */ 1777 bool ram_write_tracking_compatible(void) 1778 { 1779 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1780 int uffd_fd; 1781 RAMBlock *block; 1782 bool ret = false; 1783 1784 /* Open UFFD file descriptor */ 1785 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1786 if (uffd_fd < 0) { 1787 return false; 1788 } 1789 1790 RCU_READ_LOCK_GUARD(); 1791 1792 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1793 uint64_t uffd_ioctls; 1794 1795 /* Nothing to do with read-only and MMIO-writable regions */ 1796 if (block->mr->readonly || block->mr->rom_device) { 1797 continue; 1798 } 1799 /* Try to register block memory via UFFD-IO to track writes */ 1800 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1801 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1802 goto out; 1803 } 1804 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1805 goto out; 1806 } 1807 } 1808 ret = true; 1809 1810 out: 1811 uffd_close_fd(uffd_fd); 1812 return ret; 1813 } 1814 1815 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1816 ram_addr_t size) 1817 { 1818 const ram_addr_t end = offset + size; 1819 1820 /* 1821 * We read one byte of each page; this will preallocate page tables if 1822 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1823 * where no page was populated yet. This might require adaption when 1824 * supporting other mappings, like shmem. 1825 */ 1826 for (; offset < end; offset += block->page_size) { 1827 char tmp = *((char *)block->host + offset); 1828 1829 /* Don't optimize the read out */ 1830 asm volatile("" : "+r" (tmp)); 1831 } 1832 } 1833 1834 static inline int populate_read_section(MemoryRegionSection *section, 1835 void *opaque) 1836 { 1837 const hwaddr size = int128_get64(section->size); 1838 hwaddr offset = section->offset_within_region; 1839 RAMBlock *block = section->mr->ram_block; 1840 1841 populate_read_range(block, offset, size); 1842 return 0; 1843 } 1844 1845 /* 1846 * ram_block_populate_read: preallocate page tables and populate pages in the 1847 * RAM block by reading a byte of each page. 1848 * 1849 * Since it's solely used for userfault_fd WP feature, here we just 1850 * hardcode page size to qemu_real_host_page_size. 1851 * 1852 * @block: RAM block to populate 1853 */ 1854 static void ram_block_populate_read(RAMBlock *rb) 1855 { 1856 /* 1857 * Skip populating all pages that fall into a discarded range as managed by 1858 * a RamDiscardManager responsible for the mapped memory region of the 1859 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1860 * must not get populated automatically. 
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1977 if (ram_block_uffd_protect(block, uffd_fd)) { 1978 goto fail; 1979 } 1980 1981 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1982 block->host, block->max_length); 1983 } 1984 1985 return 0; 1986 1987 fail: 1988 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1989 1990 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1991 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1992 continue; 1993 } 1994 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1995 /* Cleanup flags and remove reference */ 1996 block->flags &= ~RAM_UF_WRITEPROTECT; 1997 memory_region_unref(block->mr); 1998 } 1999 2000 uffd_close_fd(uffd_fd); 2001 rs->uffdio_fd = -1; 2002 return -1; 2003 } 2004 2005 /** 2006 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 2007 */ 2008 void ram_write_tracking_stop(void) 2009 { 2010 RAMState *rs = ram_state; 2011 RAMBlock *block; 2012 2013 RCU_READ_LOCK_GUARD(); 2014 2015 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2016 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2017 continue; 2018 } 2019 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2020 2021 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2022 block->host, block->max_length); 2023 2024 /* Cleanup flags and remove reference */ 2025 block->flags &= ~RAM_UF_WRITEPROTECT; 2026 memory_region_unref(block->mr); 2027 } 2028 2029 /* Finally close UFFD file descriptor */ 2030 uffd_close_fd(rs->uffdio_fd); 2031 rs->uffdio_fd = -1; 2032 } 2033 2034 #else 2035 /* No target OS support, stubs just fail or ignore */ 2036 2037 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2038 { 2039 (void) rs; 2040 (void) offset; 2041 2042 return NULL; 2043 } 2044 2045 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2046 unsigned long start_page) 2047 { 2048 (void) rs; 2049 (void) pss; 2050 (void) start_page; 2051 2052 return 0; 2053 } 2054 2055 bool ram_write_tracking_available(void) 2056 { 2057 return false; 2058 } 2059 2060 bool ram_write_tracking_compatible(void) 2061 { 2062 assert(0); 2063 return false; 2064 } 2065 2066 int ram_write_tracking_start(void) 2067 { 2068 assert(0); 2069 return -1; 2070 } 2071 2072 void ram_write_tracking_stop(void) 2073 { 2074 assert(0); 2075 } 2076 #endif /* defined(__linux__) */ 2077 2078 /** 2079 * get_queued_page: unqueue a page from the postcopy requests 2080 * 2081 * Skips pages that are already sent (!dirty) 2082 * 2083 * Returns true if a queued page is found 2084 * 2085 * @rs: current RAM state 2086 * @pss: data about the state of the current dirty page scan 2087 */ 2088 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2089 { 2090 RAMBlock *block; 2091 ram_addr_t offset; 2092 bool dirty; 2093 2094 do { 2095 block = unqueue_page(rs, &offset); 2096 /* 2097 * We're sending this page, and since it's postcopy nothing else 2098 * will dirty it, and we must make sure it doesn't get sent again 2099 * even if this queue request was received after the background 2100 * search already sent it. 
 */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when vCPUs can get blocked by write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if it
         * is really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left; if any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from the postcopy destination, for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    /*
     * With postcopy preempt enabled, we send back the page directly in the
     * rp-return thread.
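     * That is, the requested page skips the regular precopy scan and is
     * pushed out immediately on the dedicated preempt channel looked up
     * below.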
     */
    if (postcopy_preempt_active()) {
        ram_addr_t page_start = start >> TARGET_PAGE_BITS;
        size_t page_size = qemu_ram_pagesize(ramblock);
        PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
        int ret = 0;

        qemu_mutex_lock(&rs->bitmap_mutex);

        pss_init(pss, ramblock, page_start);
        /*
         * Always use the preempt channel, and make sure it's there. It's
         * safe to access without the lock, because while the rp-thread is
         * running we should be the only one operating on this QEMUFile.
         */
        pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
        assert(pss->pss_channel);

        /*
         * The length must be a multiple of the host page size. Just
         * assert; if something is wrong we're mostly split-brain anyway.
         */
        assert(len % page_size == 0);
        while (len) {
            if (ram_save_host_page_urgent(pss)) {
                error_report("%s: ram_save_host_page_urgent() failed: "
                             "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
                             __func__, ramblock->idstr, start);
                ret = -1;
                break;
            }
            /*
             * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
             * will automatically be moved and point to the next host page
             * we're going to send, so no need to update here.
             *
             * Normally QEMU never sends more than one host page per
             * request, so logically the loop should only run once; keep it
             * anyway for consistency.
             */
            len -= page_size;
        };
        qemu_mutex_unlock(&rs->bitmap_mutex);

        return ret;
    }

    struct RAMSrcPageRequest *new_entry =
        g_new0(struct RAMSrcPageRequest, 1);
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is enabled (e.g., after the first round of migration), stop
     * using data compression. In theory, xbzrle can do better than
     * compression.
     */
    if (rs->xbzrle_enabled) {
        return false;
    }

    return true;
}

/*
 * Try to compress the page before posting it out; return true if the page
 * has been properly handled by compression, otherwise other paths need
 * to handle it.
 */
static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
                               RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting a new block, the first page of the block should be
     * sent out before other pages in the same block, and all the pages
     * of the previous block should have been sent out already. Keeping
     * this order is important, because the 'cont' flag is used to avoid
     * resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * a lot of CPU resource.
2320 */ 2321 if (block != pss->last_sent_block) { 2322 flush_compressed_data(rs); 2323 return false; 2324 } 2325 2326 if (compress_page_with_multi_thread(block, offset) > 0) { 2327 return true; 2328 } 2329 2330 compression_counters.busy++; 2331 return false; 2332 } 2333 2334 /** 2335 * ram_save_target_page_legacy: save one target page 2336 * 2337 * Returns the number of pages written 2338 * 2339 * @rs: current RAM state 2340 * @pss: data about the page we want to send 2341 */ 2342 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2343 { 2344 RAMBlock *block = pss->block; 2345 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2346 int res; 2347 2348 if (control_save_page(pss, block, offset, &res)) { 2349 return res; 2350 } 2351 2352 if (save_compress_page(rs, pss, block, offset)) { 2353 return 1; 2354 } 2355 2356 res = save_zero_page(pss, pss->pss_channel, block, offset); 2357 if (res > 0) { 2358 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2359 * page would be stale 2360 */ 2361 if (rs->xbzrle_enabled) { 2362 XBZRLE_cache_lock(); 2363 xbzrle_cache_zero_page(rs, block->offset + offset); 2364 XBZRLE_cache_unlock(); 2365 } 2366 return res; 2367 } 2368 2369 /* 2370 * Do not use multifd in postcopy as one whole host page should be 2371 * placed. Meanwhile postcopy requires atomic update of pages, so even 2372 * if host page size == guest page size the dest guest during run may 2373 * still see partially copied pages which is data corruption. 2374 */ 2375 if (migrate_use_multifd() && !migration_in_postcopy()) { 2376 return ram_save_multifd_page(pss->pss_channel, block, offset); 2377 } 2378 2379 return ram_save_page(rs, pss); 2380 } 2381 2382 /* Should be called before sending a host page */ 2383 static void pss_host_page_prepare(PageSearchStatus *pss) 2384 { 2385 /* How many guest pages are there in one host page? */ 2386 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2387 2388 pss->host_page_sending = true; 2389 if (guest_pfns <= 1) { 2390 /* 2391 * This covers both when guest psize == host psize, or when guest 2392 * has larger psize than the host (guest_pfns==0). 2393 * 2394 * For the latter, we always send one whole guest page per 2395 * iteration of the host page (example: an Alpha VM on x86 host 2396 * will have guest psize 8K while host psize 4K). 2397 */ 2398 pss->host_page_start = pss->page; 2399 pss->host_page_end = pss->page + 1; 2400 } else { 2401 /* 2402 * The host page spans over multiple guest pages, we send them 2403 * within the same host page iteration. 2404 */ 2405 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2406 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2407 } 2408 } 2409 2410 /* 2411 * Whether the page pointed by PSS is within the host page being sent. 2412 * Must be called after a previous pss_host_page_prepare(). 2413 */ 2414 static bool pss_within_range(PageSearchStatus *pss) 2415 { 2416 ram_addr_t ram_addr; 2417 2418 assert(pss->host_page_sending); 2419 2420 /* Over host-page boundary? 
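For example, with a 2 MiB hugepage-backed block and 4 KiB target pages, pss_host_page_prepare() sets host_page_end to host_page_start + 512, so this check stops the scan after at most 512 target pages.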
 */
    if (pss->page >= pss->host_page_end) {
        return false;
    }

    ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;

    return offset_in_ramblock(pss->block, ram_addr);
}

static void pss_host_page_finish(PageSearchStatus *pss)
{
    pss->host_page_sending = false;
    /* This is not needed, but just to reset it */
    pss->host_page_start = pss->host_page_end = 0;
}

/*
 * Send an urgent host page specified by `pss'. Need to be called with
 * bitmap_mutex held.
 *
 * Returns 0 if saving the host page succeeded, negative otherwise.
 */
static int ram_save_host_page_urgent(PageSearchStatus *pss)
{
    bool page_dirty, sent = false;
    RAMState *rs = ram_state;
    int ret = 0;

    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
    pss_host_page_prepare(pss);

    /*
     * If precopy is sending the same page, let it be done in precopy, or
     * we could send the same page in two channels and neither of them
     * would receive the whole page.
     */
    if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
        trace_postcopy_preempt_hit(pss->block->idstr,
                                   pss->page << TARGET_PAGE_BITS);
        return 0;
    }

    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        if (page_dirty) {
            /* Be strict about the return code; it must be 1 here. */
            if (migration_ops->ram_save_target_page(rs, pss) != 1) {
                error_report_once("%s: ram_save_target_page failed", __func__);
                ret = -1;
                goto out;
            }
            sent = true;
        }
        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));
out:
    pss_host_page_finish(pss);
    /* For urgent requests, flush immediately if sent */
    if (sent) {
        qemu_fflush(pss->pss_channel);
    }
    return ret;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * The caller must hold ram_state.bitmap_mutex when calling this
 * function. Note that this function can temporarily release the lock, but
 * it will make sure the lock is held again before it returns.
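 * (The lock is only dropped around ram_save_target_page() when postcopy
 * preempt is active, since both the migration thread and the return-path
 * thread may then operate on the bitmaps.)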
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
    bool page_dirty, preempt_active = postcopy_preempt_active();
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    /* Update host page boundary information */
    pss_host_page_prepare(pss);

    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        /* Check whether the page is dirty and, if it is, send it */
        if (page_dirty) {
            /*
             * Properly yield the lock only in postcopy preempt mode
             * because both migration thread and rp-return thread can
             * operate on the bitmaps.
             */
            if (preempt_active) {
                qemu_mutex_unlock(&rs->bitmap_mutex);
            }
            tmppages = migration_ops->ram_save_target_page(rs, pss);
            if (tmppages >= 0) {
                pages += tmppages;
                /*
                 * Allow rate limiting to happen in the middle of huge pages if
                 * something is sent in the current iteration.
                 */
                if (pagesize_bits > 1 && tmppages > 0) {
                    migration_rate_limit();
                }
            }
            if (preempt_active) {
                qemu_mutex_lock(&rs->bitmap_mutex);
            }
        } else {
            tmppages = 0;
        }

        if (tmppages < 0) {
            pss_host_page_finish(pss);
            return tmppages;
        }

        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));

    pss_host_page_finish(pss);

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs)
{
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    int pages = 0;

    /* No dirty page as there is zero RAM */
    if (!rs->ram_bytes_total) {
        return pages;
    }

    /*
     * Always keep last_seen_block/last_page valid during this procedure,
     * because find_dirty_block() relies on these values (e.g., we compare
     * last_seen_block with pss.block to see whether we searched all the
     * ramblocks) to detect the completion of migration. A NULL
     * last_seen_block can cause the loop below to run forever.
2598 */ 2599 if (!rs->last_seen_block) { 2600 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2601 rs->last_page = 0; 2602 } 2603 2604 pss_init(pss, rs->last_seen_block, rs->last_page); 2605 2606 while (true){ 2607 if (!get_queued_page(rs, pss)) { 2608 /* priority queue empty, so just search for something dirty */ 2609 int res = find_dirty_block(rs, pss); 2610 if (res != PAGE_DIRTY_FOUND) { 2611 if (res == PAGE_ALL_CLEAN) { 2612 break; 2613 } else if (res == PAGE_TRY_AGAIN) { 2614 continue; 2615 } 2616 } 2617 } 2618 pages = ram_save_host_page(rs, pss); 2619 if (pages) { 2620 break; 2621 } 2622 } 2623 2624 rs->last_seen_block = pss->block; 2625 rs->last_page = pss->page; 2626 2627 return pages; 2628 } 2629 2630 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2631 { 2632 uint64_t pages = size / TARGET_PAGE_SIZE; 2633 2634 if (zero) { 2635 stat64_add(&ram_atomic_counters.duplicate, pages); 2636 } else { 2637 stat64_add(&ram_atomic_counters.normal, pages); 2638 ram_transferred_add(size); 2639 qemu_file_credit_transfer(f, size); 2640 } 2641 } 2642 2643 static uint64_t ram_bytes_total_with_ignored(void) 2644 { 2645 RAMBlock *block; 2646 uint64_t total = 0; 2647 2648 RCU_READ_LOCK_GUARD(); 2649 2650 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2651 total += block->used_length; 2652 } 2653 return total; 2654 } 2655 2656 uint64_t ram_bytes_total(void) 2657 { 2658 RAMBlock *block; 2659 uint64_t total = 0; 2660 2661 RCU_READ_LOCK_GUARD(); 2662 2663 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2664 total += block->used_length; 2665 } 2666 return total; 2667 } 2668 2669 static void xbzrle_load_setup(void) 2670 { 2671 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2672 } 2673 2674 static void xbzrle_load_cleanup(void) 2675 { 2676 g_free(XBZRLE.decoded_buf); 2677 XBZRLE.decoded_buf = NULL; 2678 } 2679 2680 static void ram_state_cleanup(RAMState **rsp) 2681 { 2682 if (*rsp) { 2683 migration_page_queue_free(*rsp); 2684 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2685 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2686 g_free(*rsp); 2687 *rsp = NULL; 2688 } 2689 } 2690 2691 static void xbzrle_cleanup(void) 2692 { 2693 XBZRLE_cache_lock(); 2694 if (XBZRLE.cache) { 2695 cache_fini(XBZRLE.cache); 2696 g_free(XBZRLE.encoded_buf); 2697 g_free(XBZRLE.current_buf); 2698 g_free(XBZRLE.zero_target_page); 2699 XBZRLE.cache = NULL; 2700 XBZRLE.encoded_buf = NULL; 2701 XBZRLE.current_buf = NULL; 2702 XBZRLE.zero_target_page = NULL; 2703 } 2704 XBZRLE_cache_unlock(); 2705 } 2706 2707 static void ram_save_cleanup(void *opaque) 2708 { 2709 RAMState **rsp = opaque; 2710 RAMBlock *block; 2711 2712 /* We don't use dirty log with background snapshots */ 2713 if (!migrate_background_snapshot()) { 2714 /* caller have hold iothread lock or is in a bh, so there is 2715 * no writing race against the migration bitmap 2716 */ 2717 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2718 /* 2719 * do not stop dirty log without starting it, since 2720 * memory_global_dirty_log_stop will assert that 2721 * memory_global_dirty_log_start/stop used in pairs 2722 */ 2723 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2724 } 2725 } 2726 2727 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2728 g_free(block->clear_bmap); 2729 block->clear_bmap = NULL; 2730 g_free(block->bmap); 2731 block->bmap = NULL; 2732 } 2733 2734 xbzrle_cleanup(); 2735 compress_threads_save_cleanup(); 2736 ram_state_cleanup(rsp); 2737 g_free(migration_ops); 2738 migration_ops = NULL; 2739 } 2740 2741 static void ram_state_reset(RAMState *rs) 2742 { 2743 int 
i; 2744 2745 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2746 rs->pss[i].last_sent_block = NULL; 2747 } 2748 2749 rs->last_seen_block = NULL; 2750 rs->last_page = 0; 2751 rs->last_version = ram_list.version; 2752 rs->xbzrle_enabled = false; 2753 } 2754 2755 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2756 2757 /* **** functions for postcopy ***** */ 2758 2759 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2760 { 2761 struct RAMBlock *block; 2762 2763 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2764 unsigned long *bitmap = block->bmap; 2765 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2766 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2767 2768 while (run_start < range) { 2769 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2770 ram_discard_range(block->idstr, 2771 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2772 ((ram_addr_t)(run_end - run_start)) 2773 << TARGET_PAGE_BITS); 2774 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2775 } 2776 } 2777 } 2778 2779 /** 2780 * postcopy_send_discard_bm_ram: discard a RAMBlock 2781 * 2782 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2783 * 2784 * @ms: current migration state 2785 * @block: RAMBlock to discard 2786 */ 2787 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2788 { 2789 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2790 unsigned long current; 2791 unsigned long *bitmap = block->bmap; 2792 2793 for (current = 0; current < end; ) { 2794 unsigned long one = find_next_bit(bitmap, end, current); 2795 unsigned long zero, discard_length; 2796 2797 if (one >= end) { 2798 break; 2799 } 2800 2801 zero = find_next_zero_bit(bitmap, end, one + 1); 2802 2803 if (zero >= end) { 2804 discard_length = end - one; 2805 } else { 2806 discard_length = zero - one; 2807 } 2808 postcopy_discard_send_range(ms, one, discard_length); 2809 current = one + discard_length; 2810 } 2811 } 2812 2813 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2814 2815 /** 2816 * postcopy_each_ram_send_discard: discard all RAMBlocks 2817 * 2818 * Utility for the outgoing postcopy code. 2819 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2820 * passing it bitmap indexes and name. 2821 * (qemu_ram_foreach_block ends up passing unscaled lengths 2822 * which would mean postcopy code would have to deal with target page) 2823 * 2824 * @ms: current migration state 2825 */ 2826 static void postcopy_each_ram_send_discard(MigrationState *ms) 2827 { 2828 struct RAMBlock *block; 2829 2830 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2831 postcopy_discard_send_init(ms, block->idstr); 2832 2833 /* 2834 * Deal with TPS != HPS and huge pages. It discard any partially sent 2835 * host-page size chunks, mark any partially dirty host-page size 2836 * chunks as all dirty. In this case the host-page is the host-page 2837 * for the particular RAMBlock, i.e. it might be a huge page. 2838 */ 2839 postcopy_chunk_hostpages_pass(ms, block); 2840 2841 /* 2842 * Postcopy sends chunks of bitmap over the wire, but it 2843 * just needs indexes at this point, avoids it having 2844 * target page specific code. 
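For example, a run of dirty target pages from index 1024 through 1535 is passed to postcopy_discard_send_range() as a single (start = 1024, length = 512) range.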
2845 */ 2846 postcopy_send_discard_bm_ram(ms, block); 2847 postcopy_discard_send_finish(ms); 2848 } 2849 } 2850 2851 /** 2852 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2853 * 2854 * Helper for postcopy_chunk_hostpages; it's called twice to 2855 * canonicalize the two bitmaps, that are similar, but one is 2856 * inverted. 2857 * 2858 * Postcopy requires that all target pages in a hostpage are dirty or 2859 * clean, not a mix. This function canonicalizes the bitmaps. 2860 * 2861 * @ms: current migration state 2862 * @block: block that contains the page we want to canonicalize 2863 */ 2864 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2865 { 2866 RAMState *rs = ram_state; 2867 unsigned long *bitmap = block->bmap; 2868 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2869 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2870 unsigned long run_start; 2871 2872 if (block->page_size == TARGET_PAGE_SIZE) { 2873 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2874 return; 2875 } 2876 2877 /* Find a dirty page */ 2878 run_start = find_next_bit(bitmap, pages, 0); 2879 2880 while (run_start < pages) { 2881 2882 /* 2883 * If the start of this run of pages is in the middle of a host 2884 * page, then we need to fixup this host page. 2885 */ 2886 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2887 /* Find the end of this run */ 2888 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2889 /* 2890 * If the end isn't at the start of a host page, then the 2891 * run doesn't finish at the end of a host page 2892 * and we need to discard. 2893 */ 2894 } 2895 2896 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2897 unsigned long page; 2898 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2899 host_ratio); 2900 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2901 2902 /* Clean up the bitmap */ 2903 for (page = fixup_start_addr; 2904 page < fixup_start_addr + host_ratio; page++) { 2905 /* 2906 * Remark them as dirty, updating the count for any pages 2907 * that weren't previously dirty. 
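For example, with host_ratio == 512 (a 2 MiB hugepage and 4 KiB target pages), a dirty run starting at page 700 causes pages 512 through 1023 to be marked dirty here, and the scan then resumes at page 1024.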
2908 */ 2909 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2910 } 2911 } 2912 2913 /* Find the next dirty page for the next iteration */ 2914 run_start = find_next_bit(bitmap, pages, run_start); 2915 } 2916 } 2917 2918 /** 2919 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2920 * 2921 * Transmit the set of pages to be discarded after precopy to the target 2922 * these are pages that: 2923 * a) Have been previously transmitted but are now dirty again 2924 * b) Pages that have never been transmitted, this ensures that 2925 * any pages on the destination that have been mapped by background 2926 * tasks get discarded (transparent huge pages is the specific concern) 2927 * Hopefully this is pretty sparse 2928 * 2929 * @ms: current migration state 2930 */ 2931 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2932 { 2933 RAMState *rs = ram_state; 2934 2935 RCU_READ_LOCK_GUARD(); 2936 2937 /* This should be our last sync, the src is now paused */ 2938 migration_bitmap_sync(rs); 2939 2940 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2941 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2942 rs->last_seen_block = NULL; 2943 rs->last_page = 0; 2944 2945 postcopy_each_ram_send_discard(ms); 2946 2947 trace_ram_postcopy_send_discard_bitmap(); 2948 } 2949 2950 /** 2951 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2952 * 2953 * Returns zero on success 2954 * 2955 * @rbname: name of the RAMBlock of the request. NULL means the 2956 * same that last one. 2957 * @start: RAMBlock starting page 2958 * @length: RAMBlock size 2959 */ 2960 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2961 { 2962 trace_ram_discard_range(rbname, start, length); 2963 2964 RCU_READ_LOCK_GUARD(); 2965 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2966 2967 if (!rb) { 2968 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2969 return -1; 2970 } 2971 2972 /* 2973 * On source VM, we don't need to update the received bitmap since 2974 * we don't even have one. 2975 */ 2976 if (rb->receivedmap) { 2977 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2978 length >> qemu_target_page_bits()); 2979 } 2980 2981 return ram_block_discard_range(rb, start, length); 2982 } 2983 2984 /* 2985 * For every allocation, we will try not to crash the VM if the 2986 * allocation failed. 
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
    (*rsp)->ram_bytes_total = ram_bytes_total();

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match the initial value of the dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}

static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to the
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when we restart
             * a migration after a failed one,
             * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include
             * the whole guest memory.
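             * As a rough sizing example, a 4 GiB block with 4 KiB target
             * pages gets a 1 Mi-bit (128 KiB) bmap here, plus a clear_bmap
             * with one bit per 2^clear_bmap_shift pages.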
3096 */ 3097 block->bmap = bitmap_new(pages); 3098 bitmap_set(block->bmap, 0, pages); 3099 block->clear_bmap_shift = shift; 3100 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3101 } 3102 } 3103 } 3104 3105 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3106 { 3107 unsigned long pages; 3108 RAMBlock *rb; 3109 3110 RCU_READ_LOCK_GUARD(); 3111 3112 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3113 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3114 rs->migration_dirty_pages -= pages; 3115 } 3116 } 3117 3118 static void ram_init_bitmaps(RAMState *rs) 3119 { 3120 /* For memory_global_dirty_log_start below. */ 3121 qemu_mutex_lock_iothread(); 3122 qemu_mutex_lock_ramlist(); 3123 3124 WITH_RCU_READ_LOCK_GUARD() { 3125 ram_list_init_bitmaps(); 3126 /* We don't use dirty log with background snapshots */ 3127 if (!migrate_background_snapshot()) { 3128 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3129 migration_bitmap_sync_precopy(rs); 3130 } 3131 } 3132 qemu_mutex_unlock_ramlist(); 3133 qemu_mutex_unlock_iothread(); 3134 3135 /* 3136 * After an eventual first bitmap sync, fixup the initial bitmap 3137 * containing all 1s to exclude any discarded pages from migration. 3138 */ 3139 migration_bitmap_clear_discarded_pages(rs); 3140 } 3141 3142 static int ram_init_all(RAMState **rsp) 3143 { 3144 if (ram_state_init(rsp)) { 3145 return -1; 3146 } 3147 3148 if (xbzrle_init()) { 3149 ram_state_cleanup(rsp); 3150 return -1; 3151 } 3152 3153 ram_init_bitmaps(*rsp); 3154 3155 return 0; 3156 } 3157 3158 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3159 { 3160 RAMBlock *block; 3161 uint64_t pages = 0; 3162 3163 /* 3164 * Postcopy is not using xbzrle/compression, so no need for that. 3165 * Also, since source are already halted, we don't need to care 3166 * about dirty page logging as well. 3167 */ 3168 3169 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3170 pages += bitmap_count_one(block->bmap, 3171 block->used_length >> TARGET_PAGE_BITS); 3172 } 3173 3174 /* This may not be aligned with current bitmaps. Recalculate. */ 3175 rs->migration_dirty_pages = pages; 3176 3177 ram_state_reset(rs); 3178 3179 /* Update RAMState cache of output QEMUFile */ 3180 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3181 3182 trace_ram_state_resume_prepare(pages); 3183 } 3184 3185 /* 3186 * This function clears bits of the free pages reported by the caller from the 3187 * migration dirty bitmap. @addr is the host address corresponding to the 3188 * start of the continuous guest free pages, and @len is the total bytes of 3189 * those pages. 3190 */ 3191 void qemu_guest_free_page_hint(void *addr, size_t len) 3192 { 3193 RAMBlock *block; 3194 ram_addr_t offset; 3195 size_t used_len, start, npages; 3196 MigrationState *s = migrate_get_current(); 3197 3198 /* This function is currently expected to be used during live migration */ 3199 if (!migration_is_setup_or_active(s->state)) { 3200 return; 3201 } 3202 3203 for (; len > 0; len -= used_len, addr += used_len) { 3204 block = qemu_ram_block_from_host(addr, false, &offset); 3205 if (unlikely(!block || offset >= block->used_length)) { 3206 /* 3207 * The implementation might not support RAMBlock resize during 3208 * live migration, but it could happen in theory with future 3209 * updates. So we add a check here to capture that case. 
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        /*
         * The skipped free pages are equivalent to having been sent, from
         * clear_bmap's perspective, so clear the bits from the memory region
         * bitmap which are initially set. Otherwise those skipped pages will
         * be sent in the next round after syncing from the memory region
         * bitmap.
         */
        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section. When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;
    int ret;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already set up the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;

    WITH_RCU_READ_LOCK_GUARD() {
        qemu_put_be64(f, ram_bytes_total_with_ignored()
                         | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    migration_ops = g_malloc0(sizeof(MigrationOps));
    migration_ops->ram_save_target_page = ram_save_target_page_legacy;
    ret = multifd_send_sync_main(f);
    if (ret < 0) {
        return ret;
    }

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll hold this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is the one that
     * calls qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the first iterations, just in case it was
             * the first time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * few iterations.
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
3403 */ 3404 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3405 3406 out: 3407 if (ret >= 0 3408 && migration_is_setup_or_active(migrate_get_current()->state)) { 3409 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3410 if (ret < 0) { 3411 return ret; 3412 } 3413 3414 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3415 qemu_fflush(f); 3416 ram_transferred_add(8); 3417 3418 ret = qemu_file_get_error(f); 3419 } 3420 if (ret < 0) { 3421 return ret; 3422 } 3423 3424 return done; 3425 } 3426 3427 /** 3428 * ram_save_complete: function called to send the remaining amount of ram 3429 * 3430 * Returns zero to indicate success or negative on error 3431 * 3432 * Called with iothread lock 3433 * 3434 * @f: QEMUFile where to send the data 3435 * @opaque: RAMState pointer 3436 */ 3437 static int ram_save_complete(QEMUFile *f, void *opaque) 3438 { 3439 RAMState **temp = opaque; 3440 RAMState *rs = *temp; 3441 int ret = 0; 3442 3443 rs->last_stage = !migration_in_colo_state(); 3444 3445 WITH_RCU_READ_LOCK_GUARD() { 3446 if (!migration_in_postcopy()) { 3447 migration_bitmap_sync_precopy(rs); 3448 } 3449 3450 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3451 3452 /* try transferring iterative blocks of memory */ 3453 3454 /* flush all remaining blocks regardless of rate limiting */ 3455 qemu_mutex_lock(&rs->bitmap_mutex); 3456 while (true) { 3457 int pages; 3458 3459 pages = ram_find_and_save_block(rs); 3460 /* no more blocks to sent */ 3461 if (pages == 0) { 3462 break; 3463 } 3464 if (pages < 0) { 3465 ret = pages; 3466 break; 3467 } 3468 } 3469 qemu_mutex_unlock(&rs->bitmap_mutex); 3470 3471 flush_compressed_data(rs); 3472 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3473 } 3474 3475 if (ret < 0) { 3476 return ret; 3477 } 3478 3479 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3480 if (ret < 0) { 3481 return ret; 3482 } 3483 3484 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3485 qemu_fflush(f); 3486 3487 return 0; 3488 } 3489 3490 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3491 uint64_t *can_postcopy) 3492 { 3493 RAMState **temp = opaque; 3494 RAMState *rs = *temp; 3495 3496 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3497 3498 if (migrate_postcopy_ram()) { 3499 /* We can do postcopy, and all the data is postcopiable */ 3500 *can_postcopy += remaining_size; 3501 } else { 3502 *must_precopy += remaining_size; 3503 } 3504 } 3505 3506 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3507 uint64_t *can_postcopy) 3508 { 3509 MigrationState *s = migrate_get_current(); 3510 RAMState **temp = opaque; 3511 RAMState *rs = *temp; 3512 3513 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3514 3515 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3516 qemu_mutex_lock_iothread(); 3517 WITH_RCU_READ_LOCK_GUARD() { 3518 migration_bitmap_sync_precopy(rs); 3519 } 3520 qemu_mutex_unlock_iothread(); 3521 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3522 } 3523 3524 if (migrate_postcopy_ram()) { 3525 /* We can do postcopy, and all the data is postcopiable */ 3526 *can_postcopy += remaining_size; 3527 } else { 3528 *must_precopy += remaining_size; 3529 } 3530 } 3531 3532 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3533 { 3534 unsigned int xh_len; 3535 int xh_flags; 3536 uint8_t *loaded_data; 3537 3538 /* extract RLE header */ 3539 xh_flags = qemu_get_byte(f); 3540 xh_len = qemu_get_be16(f); 3541 3542 if 
(xh_flags != ENCODING_FLAG_XBZRLE) { 3543 error_report("Failed to load XBZRLE page - wrong compression!"); 3544 return -1; 3545 } 3546 3547 if (xh_len > TARGET_PAGE_SIZE) { 3548 error_report("Failed to load XBZRLE page - len overflow!"); 3549 return -1; 3550 } 3551 loaded_data = XBZRLE.decoded_buf; 3552 /* load data and decode */ 3553 /* it can change loaded_data to point to an internal buffer */ 3554 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3555 3556 /* decode RLE */ 3557 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3558 TARGET_PAGE_SIZE) == -1) { 3559 error_report("Failed to load XBZRLE page - decode error!"); 3560 return -1; 3561 } 3562 3563 return 0; 3564 } 3565 3566 /** 3567 * ram_block_from_stream: read a RAMBlock id from the migration stream 3568 * 3569 * Must be called from within a rcu critical section. 3570 * 3571 * Returns a pointer from within the RCU-protected ram_list. 3572 * 3573 * @mis: the migration incoming state pointer 3574 * @f: QEMUFile where to read the data from 3575 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3576 * @channel: the channel we're using 3577 */ 3578 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3579 QEMUFile *f, int flags, 3580 int channel) 3581 { 3582 RAMBlock *block = mis->last_recv_block[channel]; 3583 char id[256]; 3584 uint8_t len; 3585 3586 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3587 if (!block) { 3588 error_report("Ack, bad migration stream!"); 3589 return NULL; 3590 } 3591 return block; 3592 } 3593 3594 len = qemu_get_byte(f); 3595 qemu_get_buffer(f, (uint8_t *)id, len); 3596 id[len] = 0; 3597 3598 block = qemu_ram_block_by_name(id); 3599 if (!block) { 3600 error_report("Can't find block %s", id); 3601 return NULL; 3602 } 3603 3604 if (ramblock_is_ignored(block)) { 3605 error_report("block %s should not be migrated !", id); 3606 return NULL; 3607 } 3608 3609 mis->last_recv_block[channel] = block; 3610 3611 return block; 3612 } 3613 3614 static inline void *host_from_ram_block_offset(RAMBlock *block, 3615 ram_addr_t offset) 3616 { 3617 if (!offset_in_ramblock(block, offset)) { 3618 return NULL; 3619 } 3620 3621 return block->host + offset; 3622 } 3623 3624 static void *host_page_from_ram_block_offset(RAMBlock *block, 3625 ram_addr_t offset) 3626 { 3627 /* Note: Explicitly no check against offset_in_ramblock(). */ 3628 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3629 block->page_size); 3630 } 3631 3632 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3633 ram_addr_t offset) 3634 { 3635 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3636 } 3637 3638 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3639 ram_addr_t offset, bool record_bitmap) 3640 { 3641 if (!offset_in_ramblock(block, offset)) { 3642 return NULL; 3643 } 3644 if (!block->colo_cache) { 3645 error_report("%s: colo_cache is NULL in block :%s", 3646 __func__, block->idstr); 3647 return NULL; 3648 } 3649 3650 /* 3651 * During colo checkpoint, we need bitmap of these migrated pages. 3652 * It help us to decide which pages in ram cache should be flushed 3653 * into VM's RAM later. 
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from. We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * We use compbuf as an indicator of whether the thread was
         * properly initialized or not.
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
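        /*
         * The worker has exited at this point, so its mutex, condvar,
         * zlib stream and buffer can be torn down safely.
         */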
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}

static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}

/*
 * COLO cache: this is for the secondary VM, where we cache the whole
 * memory of the secondary VM; it is necessary to hold the global lock
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
     * decide which pages in the cache should be flushed into the SVM's RAM.
     * Here we use the same name 'ram_bitmap' as for migration.
3906 */ 3907 if (ram_bytes_total()) { 3908 RAMBlock *block; 3909 3910 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3911 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3912 block->bmap = bitmap_new(pages); 3913 } 3914 } 3915 3916 colo_init_ram_state(); 3917 return 0; 3918 }
3919 3920 /* TODO: duplicated with ram_init_bitmaps */ 3921 void colo_incoming_start_dirty_log(void) 3922 { 3923 RAMBlock *block = NULL; 3924 /* For memory_global_dirty_log_start below. */ 3925 qemu_mutex_lock_iothread(); 3926 qemu_mutex_lock_ramlist(); 3927 3928 memory_global_dirty_log_sync(); 3929 WITH_RCU_READ_LOCK_GUARD() { 3930 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3931 ramblock_sync_dirty_bitmap(ram_state, block); 3932 /* Discard this dirty bitmap record */ 3933 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3934 } 3935 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3936 } 3937 ram_state->migration_dirty_pages = 0; 3938 qemu_mutex_unlock_ramlist(); 3939 qemu_mutex_unlock_iothread(); 3940 }
3941 3942 /* It is necessary to hold the global lock to call this helper */ 3943 void colo_release_ram_cache(void) 3944 { 3945 RAMBlock *block; 3946 3947 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3948 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3949 g_free(block->bmap); 3950 block->bmap = NULL; 3951 } 3952 3953 WITH_RCU_READ_LOCK_GUARD() { 3954 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3955 if (block->colo_cache) { 3956 qemu_anon_ram_free(block->colo_cache, block->used_length); 3957 block->colo_cache = NULL; 3958 } 3959 } 3960 } 3961 ram_state_cleanup(&ram_state); 3962 }
3963 3964 /** 3965 * ram_load_setup: Set up RAM for the migration incoming side 3966 * 3967 * Returns zero to indicate success and negative for error 3968 * 3969 * @f: QEMUFile where to receive the data 3970 * @opaque: RAMState pointer 3971 */ 3972 static int ram_load_setup(QEMUFile *f, void *opaque) 3973 { 3974 if (compress_threads_load_setup(f)) { 3975 return -1; 3976 } 3977 3978 xbzrle_load_setup(); 3979 ramblock_recv_map_init(); 3980 3981 return 0; 3982 }
3983 3984 static int ram_load_cleanup(void *opaque) 3985 { 3986 RAMBlock *rb; 3987 3988 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3989 qemu_ram_block_writeback(rb); 3990 } 3991 3992 xbzrle_load_cleanup(); 3993 compress_threads_load_cleanup(); 3994 3995 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3996 g_free(rb->receivedmap); 3997 rb->receivedmap = NULL; 3998 } 3999 4000 return 0; 4001 }
4002 4003 /** 4004 * ram_postcopy_incoming_init: allocate postcopy data structures 4005 * 4006 * Returns 0 for success and negative on error 4007 * 4008 * @mis: current migration incoming state 4009 * 4010 * Allocate the data structures etc. needed by incoming migration with 4011 * postcopy-ram. postcopy-ram's similarly named 4012 * postcopy_ram_incoming_init() does the work. 4013 */ 4014 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 4015 { 4016 return postcopy_ram_incoming_init(mis); 4017 }
4018 4019 /** 4020 * ram_load_postcopy: load a page in postcopy case 4021 * 4022 * Returns 0 for success or -errno in case of error 4023 * 4024 * Called in postcopy mode by ram_load(). 4025 * rcu_read_lock is taken prior to this being called.
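 *
 * Added note (summary of the body below): incoming target pages are
 * accumulated in the per-channel temporary huge page
 * (mis->postcopy_tmp_pages[channel]) and the host page is only placed into
 * guest memory once all of its target pages have arrived; e.g. with a 2 MiB
 * hugetlbfs page size and 4 KiB target pages, 512 target pages are buffered
 * before a single placement.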
4026 * 4027 * @f: QEMUFile where to send the data 4028 * @channel: the channel to use for loading 4029 */ 4030 int ram_load_postcopy(QEMUFile *f, int channel) 4031 { 4032 int flags = 0, ret = 0; 4033 bool place_needed = false; 4034 bool matches_target_page_size = false; 4035 MigrationIncomingState *mis = migration_incoming_get_current(); 4036 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 4037 4038 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4039 ram_addr_t addr; 4040 void *page_buffer = NULL; 4041 void *place_source = NULL; 4042 RAMBlock *block = NULL; 4043 uint8_t ch; 4044 int len; 4045 4046 addr = qemu_get_be64(f); 4047 4048 /* 4049 * If qemu file error, we should stop here, and then "addr" 4050 * may be invalid 4051 */ 4052 ret = qemu_file_get_error(f); 4053 if (ret) { 4054 break; 4055 } 4056 4057 flags = addr & ~TARGET_PAGE_MASK; 4058 addr &= TARGET_PAGE_MASK; 4059 4060 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 4061 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4062 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 4063 block = ram_block_from_stream(mis, f, flags, channel); 4064 if (!block) { 4065 ret = -EINVAL; 4066 break; 4067 } 4068 4069 /* 4070 * Relying on used_length is racy and can result in false positives. 4071 * We might place pages beyond used_length in case RAM was shrunk 4072 * while in postcopy, which is fine - trying to place via 4073 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 4074 */ 4075 if (!block->host || addr >= block->postcopy_length) { 4076 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4077 ret = -EINVAL; 4078 break; 4079 } 4080 tmp_page->target_pages++; 4081 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 4082 /* 4083 * Postcopy requires that we place whole host pages atomically; 4084 * these may be huge pages for RAMBlocks that are backed by 4085 * hugetlbfs. 4086 * To make it atomic, the data is read into a temporary page 4087 * that's moved into place later. 4088 * The migration protocol uses, possibly smaller, target-pages 4089 * however the source ensures it always sends all the components 4090 * of a host page in one chunk. 4091 */ 4092 page_buffer = tmp_page->tmp_huge_page + 4093 host_page_offset_from_ram_block_offset(block, addr); 4094 /* If all TP are zero then we can optimise the place */ 4095 if (tmp_page->target_pages == 1) { 4096 tmp_page->host_addr = 4097 host_page_from_ram_block_offset(block, addr); 4098 } else if (tmp_page->host_addr != 4099 host_page_from_ram_block_offset(block, addr)) { 4100 /* not the 1st TP within the HP */ 4101 error_report("Non-same host page detected on channel %d: " 4102 "Target host page %p, received host page %p " 4103 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 4104 channel, tmp_page->host_addr, 4105 host_page_from_ram_block_offset(block, addr), 4106 block->idstr, addr, tmp_page->target_pages); 4107 ret = -EINVAL; 4108 break; 4109 } 4110 4111 /* 4112 * If it's the last part of a host page then we place the host 4113 * page 4114 */ 4115 if (tmp_page->target_pages == 4116 (block->page_size / TARGET_PAGE_SIZE)) { 4117 place_needed = true; 4118 } 4119 place_source = tmp_page->tmp_huge_page; 4120 } 4121 4122 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4123 case RAM_SAVE_FLAG_ZERO: 4124 ch = qemu_get_byte(f); 4125 /* 4126 * Can skip to set page_buffer when 4127 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
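 * Added note: in that case the all_zero path below lets
 * postcopy_place_page_zero() install the page without touching the
 * temporary buffer at all, so filling page_buffer would be wasted work.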
4128 */ 4129 if (ch || !matches_target_page_size) { 4130 memset(page_buffer, ch, TARGET_PAGE_SIZE); 4131 } 4132 if (ch) { 4133 tmp_page->all_zero = false; 4134 } 4135 break; 4136 4137 case RAM_SAVE_FLAG_PAGE: 4138 tmp_page->all_zero = false; 4139 if (!matches_target_page_size) { 4140 /* For huge pages, we always use temporary buffer */ 4141 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 4142 } else { 4143 /* 4144 * For small pages that matches target page size, we 4145 * avoid the qemu_file copy. Instead we directly use 4146 * the buffer of QEMUFile to place the page. Note: we 4147 * cannot do any QEMUFile operation before using that 4148 * buffer to make sure the buffer is valid when 4149 * placing the page. 4150 */ 4151 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 4152 TARGET_PAGE_SIZE); 4153 } 4154 break; 4155 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4156 tmp_page->all_zero = false; 4157 len = qemu_get_be32(f); 4158 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4159 error_report("Invalid compressed data length: %d", len); 4160 ret = -EINVAL; 4161 break; 4162 } 4163 decompress_data_with_multi_threads(f, page_buffer, len); 4164 break; 4165 4166 case RAM_SAVE_FLAG_EOS: 4167 /* normal exit */ 4168 multifd_recv_sync_main(); 4169 break; 4170 default: 4171 error_report("Unknown combination of migration flags: 0x%x" 4172 " (postcopy mode)", flags); 4173 ret = -EINVAL; 4174 break; 4175 } 4176 4177 /* Got the whole host page, wait for decompress before placing. */ 4178 if (place_needed) { 4179 ret |= wait_for_decompress_done(); 4180 } 4181 4182 /* Detect for any possible file errors */ 4183 if (!ret && qemu_file_get_error(f)) { 4184 ret = qemu_file_get_error(f); 4185 } 4186 4187 if (!ret && place_needed) { 4188 if (tmp_page->all_zero) { 4189 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 4190 } else { 4191 ret = postcopy_place_page(mis, tmp_page->host_addr, 4192 place_source, block); 4193 } 4194 place_needed = false; 4195 postcopy_temp_page_reset(tmp_page); 4196 } 4197 } 4198 4199 return ret; 4200 } 4201 4202 static bool postcopy_is_running(void) 4203 { 4204 PostcopyState ps = postcopy_state_get(); 4205 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 4206 } 4207 4208 /* 4209 * Flush content of RAM cache into SVM's memory. 4210 * Only flush the pages that be dirtied by PVM or SVM or both. 
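 *
 * As a sketch of the loop below (added note): colo_bitmap_find_dirty()
 * yields the start `offset` and length `num` of a run of dirty pages, each
 * page's dirty bit is cleared, and the whole run is copied from
 * block->colo_cache to block->host with a single memcpy() of
 * num * TARGET_PAGE_SIZE bytes.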
4211 */ 4212 void colo_flush_ram_cache(void) 4213 { 4214 RAMBlock *block = NULL; 4215 void *dst_host; 4216 void *src_host; 4217 unsigned long offset = 0; 4218 4219 memory_global_dirty_log_sync(); 4220 WITH_RCU_READ_LOCK_GUARD() { 4221 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4222 ramblock_sync_dirty_bitmap(ram_state, block); 4223 } 4224 } 4225 4226 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 4227 WITH_RCU_READ_LOCK_GUARD() { 4228 block = QLIST_FIRST_RCU(&ram_list.blocks); 4229 4230 while (block) { 4231 unsigned long num = 0; 4232 4233 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4234 if (!offset_in_ramblock(block, 4235 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4236 offset = 0; 4237 num = 0; 4238 block = QLIST_NEXT_RCU(block, next); 4239 } else { 4240 unsigned long i = 0; 4241 4242 for (i = 0; i < num; i++) { 4243 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4244 } 4245 dst_host = block->host 4246 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4247 src_host = block->colo_cache 4248 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4249 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4250 offset += num; 4251 } 4252 } 4253 } 4254 trace_colo_flush_ram_cache_end(); 4255 } 4256 4257 /** 4258 * ram_load_precopy: load pages in precopy case 4259 * 4260 * Returns 0 for success or -errno in case of error 4261 * 4262 * Called in precopy mode by ram_load(). 4263 * rcu_read_lock is taken prior to this being called. 4264 * 4265 * @f: QEMUFile where to send the data 4266 */ 4267 static int ram_load_precopy(QEMUFile *f) 4268 { 4269 MigrationIncomingState *mis = migration_incoming_get_current(); 4270 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 4271 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4272 bool postcopy_advised = migration_incoming_postcopy_advised(); 4273 if (!migrate_use_compression()) { 4274 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 4275 } 4276 4277 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4278 ram_addr_t addr, total_ram_bytes; 4279 void *host = NULL, *host_bak = NULL; 4280 uint8_t ch; 4281 4282 /* 4283 * Yield periodically to let main loop run, but an iteration of 4284 * the main loop is expensive, so do it each some iterations 4285 */ 4286 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4287 aio_co_schedule(qemu_get_current_aio_context(), 4288 qemu_coroutine_self()); 4289 qemu_coroutine_yield(); 4290 } 4291 i++; 4292 4293 addr = qemu_get_be64(f); 4294 flags = addr & ~TARGET_PAGE_MASK; 4295 addr &= TARGET_PAGE_MASK; 4296 4297 if (flags & invalid_flags) { 4298 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 4299 error_report("Received an unexpected compressed page"); 4300 } 4301 4302 ret = -EINVAL; 4303 break; 4304 } 4305 4306 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4307 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4308 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4309 RAM_CHANNEL_PRECOPY); 4310 4311 host = host_from_ram_block_offset(block, addr); 4312 /* 4313 * After going into COLO stage, we should not load the page 4314 * into SVM's memory directly, we put them into colo_cache firstly. 4315 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4316 * Previously, we copied all these memory in preparing stage of COLO 4317 * while we need to stop VM, which is a time-consuming process. 
4318 * Here we optimize it by a trick, back-up every page while in 4319 * migration process while COLO is enabled, though it affects the 4320 * speed of the migration, but it obviously reduce the downtime of 4321 * back-up all SVM'S memory in COLO preparing stage. 4322 */ 4323 if (migration_incoming_colo_enabled()) { 4324 if (migration_incoming_in_colo_state()) { 4325 /* In COLO stage, put all pages into cache temporarily */ 4326 host = colo_cache_from_block_offset(block, addr, true); 4327 } else { 4328 /* 4329 * In migration stage but before COLO stage, 4330 * Put all pages into both cache and SVM's memory. 4331 */ 4332 host_bak = colo_cache_from_block_offset(block, addr, false); 4333 } 4334 } 4335 if (!host) { 4336 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4337 ret = -EINVAL; 4338 break; 4339 } 4340 if (!migration_incoming_in_colo_state()) { 4341 ramblock_recv_bitmap_set(block, host); 4342 } 4343 4344 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4345 } 4346 4347 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4348 case RAM_SAVE_FLAG_MEM_SIZE: 4349 /* Synchronize RAM block list */ 4350 total_ram_bytes = addr; 4351 while (!ret && total_ram_bytes) { 4352 RAMBlock *block; 4353 char id[256]; 4354 ram_addr_t length; 4355 4356 len = qemu_get_byte(f); 4357 qemu_get_buffer(f, (uint8_t *)id, len); 4358 id[len] = 0; 4359 length = qemu_get_be64(f); 4360 4361 block = qemu_ram_block_by_name(id); 4362 if (block && !qemu_ram_is_migratable(block)) { 4363 error_report("block %s should not be migrated !", id); 4364 ret = -EINVAL; 4365 } else if (block) { 4366 if (length != block->used_length) { 4367 Error *local_err = NULL; 4368 4369 ret = qemu_ram_resize(block, length, 4370 &local_err); 4371 if (local_err) { 4372 error_report_err(local_err); 4373 } 4374 } 4375 /* For postcopy we need to check hugepage sizes match */ 4376 if (postcopy_advised && migrate_postcopy_ram() && 4377 block->page_size != qemu_host_page_size) { 4378 uint64_t remote_page_size = qemu_get_be64(f); 4379 if (remote_page_size != block->page_size) { 4380 error_report("Mismatched RAM page size %s " 4381 "(local) %zd != %" PRId64, 4382 id, block->page_size, 4383 remote_page_size); 4384 ret = -EINVAL; 4385 } 4386 } 4387 if (migrate_ignore_shared()) { 4388 hwaddr addr = qemu_get_be64(f); 4389 if (ramblock_is_ignored(block) && 4390 block->mr->addr != addr) { 4391 error_report("Mismatched GPAs for block %s " 4392 "%" PRId64 "!= %" PRId64, 4393 id, (uint64_t)addr, 4394 (uint64_t)block->mr->addr); 4395 ret = -EINVAL; 4396 } 4397 } 4398 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 4399 block->idstr); 4400 } else { 4401 error_report("Unknown ramblock \"%s\", cannot " 4402 "accept migration", id); 4403 ret = -EINVAL; 4404 } 4405 4406 total_ram_bytes -= length; 4407 } 4408 break; 4409 4410 case RAM_SAVE_FLAG_ZERO: 4411 ch = qemu_get_byte(f); 4412 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 4413 break; 4414 4415 case RAM_SAVE_FLAG_PAGE: 4416 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4417 break; 4418 4419 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4420 len = qemu_get_be32(f); 4421 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4422 error_report("Invalid compressed data length: %d", len); 4423 ret = -EINVAL; 4424 break; 4425 } 4426 decompress_data_with_multi_threads(f, host, len); 4427 break; 4428 4429 case RAM_SAVE_FLAG_XBZRLE: 4430 if (load_xbzrle(f, addr, host) < 0) { 4431 error_report("Failed to decompress XBZRLE page at " 4432 RAM_ADDR_FMT, addr); 4433 ret = -EINVAL; 4434 break; 4435 } 4436 break; 4437 case 
RAM_SAVE_FLAG_EOS: 4438 /* normal exit */ 4439 multifd_recv_sync_main(); 4440 break; 4441 default: 4442 if (flags & RAM_SAVE_FLAG_HOOK) { 4443 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4444 } else { 4445 error_report("Unknown combination of migration flags: 0x%x", 4446 flags); 4447 ret = -EINVAL; 4448 } 4449 } 4450 if (!ret) { 4451 ret = qemu_file_get_error(f); 4452 } 4453 if (!ret && host_bak) { 4454 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4455 } 4456 } 4457 4458 ret |= wait_for_decompress_done(); 4459 return ret; 4460 } 4461 4462 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4463 { 4464 int ret = 0; 4465 static uint64_t seq_iter; 4466 /* 4467 * If system is running in postcopy mode, page inserts to host memory must 4468 * be atomic 4469 */ 4470 bool postcopy_running = postcopy_is_running(); 4471 4472 seq_iter++; 4473 4474 if (version_id != 4) { 4475 return -EINVAL; 4476 } 4477 4478 /* 4479 * This RCU critical section can be very long running. 4480 * When RCU reclaims in the code start to become numerous, 4481 * it will be necessary to reduce the granularity of this 4482 * critical section. 4483 */ 4484 WITH_RCU_READ_LOCK_GUARD() { 4485 if (postcopy_running) { 4486 /* 4487 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4488 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4489 * service fast page faults. 4490 */ 4491 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4492 } else { 4493 ret = ram_load_precopy(f); 4494 } 4495 } 4496 trace_ram_load_complete(ret, seq_iter); 4497 4498 return ret; 4499 } 4500 4501 static bool ram_has_postcopy(void *opaque) 4502 { 4503 RAMBlock *rb; 4504 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4505 if (ramblock_is_pmem(rb)) { 4506 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4507 "is not supported now!", rb->idstr, rb->host); 4508 return false; 4509 } 4510 } 4511 4512 return migrate_postcopy_ram(); 4513 } 4514 4515 /* Sync all the dirty bitmap with destination VM. */ 4516 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4517 { 4518 RAMBlock *block; 4519 QEMUFile *file = s->to_dst_file; 4520 int ramblock_count = 0; 4521 4522 trace_ram_dirty_bitmap_sync_start(); 4523 4524 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4525 qemu_savevm_send_recv_bitmap(file, block->idstr); 4526 trace_ram_dirty_bitmap_request(block->idstr); 4527 ramblock_count++; 4528 } 4529 4530 trace_ram_dirty_bitmap_sync_wait(); 4531 4532 /* Wait until all the ramblocks' dirty bitmap synced */ 4533 while (ramblock_count--) { 4534 qemu_sem_wait(&s->rp_state.rp_sem); 4535 } 4536 4537 trace_ram_dirty_bitmap_sync_complete(); 4538 4539 return 0; 4540 } 4541 4542 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4543 { 4544 qemu_sem_post(&s->rp_state.rp_sem); 4545 } 4546 4547 /* 4548 * Read the received bitmap, revert it as the initial dirty bitmap. 4549 * This is only used when the postcopy migration is paused but wants 4550 * to resume from a middle point. 
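 *
 * The stream layout consumed below is, roughly:
 *   be64 size | `size` bytes of little-endian bitmap (padded to 8 bytes) |
 *   be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING)
 * where `size` must match this ramblock's padded bitmap size.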
4551 */ 4552 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4553 { 4554 int ret = -EINVAL; 4555 /* from_dst_file is always valid because we're within rp_thread */ 4556 QEMUFile *file = s->rp_state.from_dst_file; 4557 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4558 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4559 uint64_t size, end_mark; 4560 4561 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4562 4563 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4564 error_report("%s: incorrect state %s", __func__, 4565 MigrationStatus_str(s->state)); 4566 return -EINVAL; 4567 } 4568 4569 /* 4570 * Note: see comments in ramblock_recv_bitmap_send() on why we 4571 * need the endianness conversion, and the paddings. 4572 */ 4573 local_size = ROUND_UP(local_size, 8); 4574 4575 /* Add paddings */ 4576 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4577 4578 size = qemu_get_be64(file); 4579 4580 /* The size of the bitmap should match with our ramblock */ 4581 if (size != local_size) { 4582 error_report("%s: ramblock '%s' bitmap size mismatch " 4583 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4584 block->idstr, size, local_size); 4585 ret = -EINVAL; 4586 goto out; 4587 } 4588 4589 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4590 end_mark = qemu_get_be64(file); 4591 4592 ret = qemu_file_get_error(file); 4593 if (ret || size != local_size) { 4594 error_report("%s: read bitmap failed for ramblock '%s': %d" 4595 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4596 __func__, block->idstr, ret, local_size, size); 4597 ret = -EIO; 4598 goto out; 4599 } 4600 4601 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4602 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4603 __func__, block->idstr, end_mark); 4604 ret = -EINVAL; 4605 goto out; 4606 } 4607 4608 /* 4609 * Endianness conversion. We are during postcopy (though paused). 4610 * The dirty bitmap won't change. We can directly modify it. 4611 */ 4612 bitmap_from_le(block->bmap, le_bitmap, nbits); 4613 4614 /* 4615 * What we received is "received bitmap". Revert it as the initial 4616 * dirty bitmap for this ramblock. 4617 */ 4618 bitmap_complement(block->bmap, block->bmap, nbits); 4619 4620 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4621 ramblock_dirty_bitmap_clear_discarded_pages(block); 4622 4623 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4624 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4625 4626 /* 4627 * We succeeded to sync bitmap for current ramblock. If this is 4628 * the last one to sync, we need to notify the main send thread. 
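 * ram_dirty_bitmap_sync_all() waits on rp_state.rp_sem once per ramblock, so
 * each successful reload must post the semaphore exactly once, which is what
 * ram_dirty_bitmap_reload_notify() does below.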
4629 */ 4630 ram_dirty_bitmap_reload_notify(s); 4631 4632 ret = 0; 4633 out: 4634 g_free(le_bitmap); 4635 return ret; 4636 } 4637 4638 static int ram_resume_prepare(MigrationState *s, void *opaque) 4639 { 4640 RAMState *rs = *(RAMState **)opaque; 4641 int ret; 4642 4643 ret = ram_dirty_bitmap_sync_all(s, rs); 4644 if (ret) { 4645 return ret; 4646 } 4647 4648 ram_state_resume_prepare(rs, s->to_dst_file); 4649 4650 return 0; 4651 } 4652 4653 void postcopy_preempt_shutdown_file(MigrationState *s) 4654 { 4655 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4656 qemu_fflush(s->postcopy_qemufile_src); 4657 } 4658 4659 static SaveVMHandlers savevm_ram_handlers = { 4660 .save_setup = ram_save_setup, 4661 .save_live_iterate = ram_save_iterate, 4662 .save_live_complete_postcopy = ram_save_complete, 4663 .save_live_complete_precopy = ram_save_complete, 4664 .has_postcopy = ram_has_postcopy, 4665 .state_pending_exact = ram_state_pending_exact, 4666 .state_pending_estimate = ram_state_pending_estimate, 4667 .load_state = ram_load, 4668 .save_cleanup = ram_save_cleanup, 4669 .load_setup = ram_load_setup, 4670 .load_cleanup = ram_load_cleanup, 4671 .resume_prepare = ram_resume_prepare, 4672 }; 4673 4674 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4675 size_t old_size, size_t new_size) 4676 { 4677 PostcopyState ps = postcopy_state_get(); 4678 ram_addr_t offset; 4679 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4680 Error *err = NULL; 4681 4682 if (ramblock_is_ignored(rb)) { 4683 return; 4684 } 4685 4686 if (!migration_is_idle()) { 4687 /* 4688 * Precopy code on the source cannot deal with the size of RAM blocks 4689 * changing at random points in time - especially after sending the 4690 * RAM block sizes in the migration stream, they must no longer change. 4691 * Abort and indicate a proper reason. 4692 */ 4693 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4694 migration_cancel(err); 4695 error_free(err); 4696 } 4697 4698 switch (ps) { 4699 case POSTCOPY_INCOMING_ADVISE: 4700 /* 4701 * Update what ram_postcopy_incoming_init()->init_range() does at the 4702 * time postcopy was advised. Syncing RAM blocks with the source will 4703 * result in RAM resizes. 4704 */ 4705 if (old_size < new_size) { 4706 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4707 error_report("RAM block '%s' discard of resized RAM failed", 4708 rb->idstr); 4709 } 4710 } 4711 rb->postcopy_length = new_size; 4712 break; 4713 case POSTCOPY_INCOMING_NONE: 4714 case POSTCOPY_INCOMING_RUNNING: 4715 case POSTCOPY_INCOMING_END: 4716 /* 4717 * Once our guest is running, postcopy does no longer care about 4718 * resizes. When growing, the new memory was not available on the 4719 * source, no handler needed. 4720 */ 4721 break; 4722 default: 4723 error_report("RAM block '%s' resized during postcopy state: %d", 4724 rb->idstr, ps); 4725 exit(-1); 4726 } 4727 } 4728 4729 static RAMBlockNotifier ram_mig_ram_notifier = { 4730 .ram_block_resized = ram_mig_ram_block_resized, 4731 }; 4732 4733 void ram_mig_init(void) 4734 { 4735 qemu_mutex_init(&XBZRLE.lock); 4736 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4737 ram_block_notifier_add(&ram_mig_ram_notifier); 4738 } 4739
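/*
 * Added note: register_savevm_live("ram", 0, 4, ...) above registers these
 * handlers with section version 4, matching the only version_id that
 * ram_load() accepts.
 */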