/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"
#include "options.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */

int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
                                 uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        /* We must check that AVX is not just available, but usable. */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /*
             * 0xe6:
             * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                   and ZMM16-ZMM31 state are enabled by OS)
             * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
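
/*
 * Illustrative note (not from the original source): any access to
 * XBZRLE.cache outside of these helpers is expected to follow the same
 * pattern, e.g.:
 *
 *     XBZRLE_cache_lock();
 *     ... cache_insert() / cache_is_cached() / get_cached_data() ...
 *     XBZRLE_cache_unlock();
 *
 * The lock is only taken when the xbzrle capability is enabled, so the
 * helpers are cheap no-ops for non-xbzrle migrations.
 */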

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by the XBZRLE cache lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
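
/*
 * Worked example (illustrative, not from the original source) for
 * ramblock_recv_bitmap_send() below: for a 1GiB RAMBlock with 4KiB
 * target pages, nbits = 1GiB / 4KiB = 262144, so the bitmap payload is
 * DIV_ROUND_UP(262144, 8) = 32768 bytes, already a multiple of 8.  A
 * 36KiB block would give nbits = 9, DIV_ROUND_UP(9, 8) = 2 bytes, which
 * gets padded up to 8 bytes on the wire so that 32-bit and 64-bit hosts
 * interoperate.
 */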

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required so that the stream stays valid when the source and
     * destination VMs do not use the same endianness.  (Note: big
     * endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines.  We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration? */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
    stat64_add(&mig_stats.transferred, bytes);
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/*
 * comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_compress() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_compress()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /*
         * comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes.  If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration.  Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet.  Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /*
             * Compute the ideal CPU percentage used by the guest, which may
             * make the dirty rate match the dirty rate threshold.
             */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
}
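
/*
 * Worked example (illustrative, not from the original source) for the
 * tailslow path above: with throttle_now = 60%, the guest currently gets
 * cpu_now = 40%.  If the period dirtied twice as many bytes as the
 * threshold allows (bytes_dirty_threshold / bytes_dirty_period = 0.5),
 * then cpu_ideal = 40 * 0.5 = 20%, so throttle_inc = MIN(40 - 20,
 * pct_increment).  This caps the step so the throttle does not overshoot
 * near the tail of convergence.
 */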

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded.  This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included.  In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}
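
/*
 * Illustrative sketch (not from the original source): an XBZRLE page
 * record produced above looks like this on the wire:
 *
 *   8 bytes   offset | RAM_SAVE_FLAG_XBZRLE (| RAM_SAVE_FLAG_CONTINUE)
 *   1 + n     block idstr length and string, only when the block changes
 *   1 byte    ENCODING_FLAG_XBZRLE
 *   2 bytes   encoded_len (big endian)
 *   n bytes   xbzrle-encoded delta against the cached copy
 *
 * which is why the accounting adds "encoded_len + 1 + 2" on top of the
 * header size returned by save_page_header().
 */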

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing is
 * found.  Note that when pss->host_page_sending==true we are in the
 * middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within
     * the current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long.  We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}
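
/*
 * Worked example (illustrative, not from the original source): with 4KiB
 * target pages (TARGET_PAGE_BITS = 12) and clear_bmap_shift = 18, one
 * clear-bitmap chunk covers 1ULL << (12 + 18) = 1GiB of guest memory.
 * Clearing page 300000 therefore clears the whole 1GiB-aligned chunk
 * [1GiB, 2GiB) that contains it, in one
 * memory_region_clear_dirty_bitmap() call instead of one call per page.
 */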

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary
     * is exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the
 * contiguous dirty pages.
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}
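
/*
 * Usage sketch (illustrative, not from the original source): for a dirty
 * bitmap that looks like 0b0011100 (pages 2..4 dirty), calling
 * colo_bitmap_find_dirty(rs, rb, 0, &num) returns 2 with num = 3, i.e.
 * the start of the run and its length, so COLO can handle the whole run
 * in one go.
 */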

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size.  If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
           stat64_get(&mig_stats.zero_pages) +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate =
            (double)(xbzrle_counters.cache_miss -
                     rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_compress()) {
        compression_counters.busy_rate =
            (double)(compression_counters.busy -
                     rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                        rs->compress_pages_prev) *
                                       TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
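
/*
 * Worked example (illustrative, not from the original source) for the
 * auto-converge trigger below: with throttle-trigger-threshold = 50 and
 * 1 GiB transferred during the last period, bytes_dirty_threshold is
 * 1 GiB * 50 / 100 = 512 MiB.  If the guest dirtied more than 512 MiB
 * per period for two consecutive periods, mig_throttle_guest_down() is
 * invoked to start or increase CPU throttling.
 */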

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress.  Avoid this by disabling the
     * throttling logic during the bulk phase of block migration.
     */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /*
         * The following detection logic can be refined later.  For now:
         * check whether the ratio between dirtied bytes and the approximate
         * amount of bytes that just got transferred since the last time
         * we were in this routine reaches the threshold.  If that happens
         * twice, start or increase throttling.
         */
        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @file: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}
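
/*
 * Illustrative note (not from the original source): a zero page is thus
 * extremely cheap on the wire: the 8-byte header with RAM_SAVE_FLAG_ZERO
 * set (plus the block idstr if the block changed) followed by a single
 * 0x00 byte, instead of TARGET_PAGE_SIZE bytes of payload.
 */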

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written (1), or -1 if the page was not
 * a zero page.
 *
 * @pss: current PSS channel
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&mig_stats.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&mig_stats.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&mig_stats.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}
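
/*
 * Illustrative sketch (not from the original source): a normal page
 * record is the save_page_header() output (8-byte offset|flags word,
 * plus the idstr when the block changes) followed by TARGET_PAGE_SIZE
 * raw bytes.  The async variant hands the page buffer to the QEMUFile
 * layer without an extra copy; callers that hand over data which may
 * change before it reaches the wire (e.g. cached XBZRLE data) pass
 * async=false, see ram_save_page() below.
 */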

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /*
             * Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&mig_stats.normal_pages, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, f, block, offset)) {
        return true;
    }

    save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch any error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&mig_stats.zero_pages, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
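
/*
 * Illustrative summary (not from the original source) of the handshake
 * between the migration thread and a compression worker:
 *
 *   migration thread                      worker (do_data_compress)
 *   ----------------                      -------------------------
 *   find comp_param[i].done == true
 *   drain comp_param[i].file to stream
 *   lock param->mutex
 *   set_compress_params(block, offset)
 *   signal param->cond, unlock            wakes, sees param->block != NULL
 *                                         compresses into param->file
 *                                         done = true, signal comp_done_cond
 *
 * The compressed output only reaches the wire the next time the
 * migration thread picks (or flushes) that worker, which is why
 * flush_compressed_data() is needed at round boundaries.
 */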

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using data compression at this
             * point.  In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}
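
/*
 * Usage sketch (illustrative, not from the original source): callers
 * typically loop on the tri-state result, e.g.:
 *
 *     do {
 *         ret = find_dirty_block(rs, pss);
 *     } while (ret == PAGE_TRY_AGAIN);
 *
 * so PAGE_TRY_AGAIN just advances the scan to the next RAMBlock, while
 * PAGE_ALL_CLEAN ends the current iteration and PAGE_DIRTY_FOUND lets
 * the caller go on to send the page at pss->block/pss->page.
 */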

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *         NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/*
 * ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/*
 * ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet.  This might require adaption when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}
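
/*
 * Illustrative note (not from the original source): the empty
 * asm volatile("" : "+r" (tmp)) above makes the compiler treat tmp as
 * both read and written by opaque assembly, so the load from the guest
 * page cannot be elided as dead code; without it, an optimizing build
 * could drop the read and leave the page table entry unpopulated.
 */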
We don't have to track 1860 * modifications via userfaultfd WP reliably, because these pages will 1861 * not be part of the migration stream either way -- see 1862 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1863 * 1864 * Note: The result is only stable while migrating (precopy/postcopy). 1865 */ 1866 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1867 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1868 MemoryRegionSection section = { 1869 .mr = rb->mr, 1870 .offset_within_region = 0, 1871 .size = rb->mr->size, 1872 }; 1873 1874 ram_discard_manager_replay_populated(rdm, §ion, 1875 populate_read_section, NULL); 1876 } else { 1877 populate_read_range(rb, 0, rb->used_length); 1878 } 1879 } 1880 1881 /* 1882 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1883 */ 1884 void ram_write_tracking_prepare(void) 1885 { 1886 RAMBlock *block; 1887 1888 RCU_READ_LOCK_GUARD(); 1889 1890 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1891 /* Nothing to do with read-only and MMIO-writable regions */ 1892 if (block->mr->readonly || block->mr->rom_device) { 1893 continue; 1894 } 1895 1896 /* 1897 * Populate pages of the RAM block before enabling userfault_fd 1898 * write protection. 1899 * 1900 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1901 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1902 * pages with pte_none() entries in page table. 1903 */ 1904 ram_block_populate_read(block); 1905 } 1906 } 1907 1908 static inline int uffd_protect_section(MemoryRegionSection *section, 1909 void *opaque) 1910 { 1911 const hwaddr size = int128_get64(section->size); 1912 const hwaddr offset = section->offset_within_region; 1913 RAMBlock *rb = section->mr->ram_block; 1914 int uffd_fd = (uintptr_t)opaque; 1915 1916 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1917 false); 1918 } 1919 1920 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1921 { 1922 assert(rb->flags & RAM_UF_WRITEPROTECT); 1923 1924 /* See ram_block_populate_read() */ 1925 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1926 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1927 MemoryRegionSection section = { 1928 .mr = rb->mr, 1929 .offset_within_region = 0, 1930 .size = rb->mr->size, 1931 }; 1932 1933 return ram_discard_manager_replay_populated(rdm, §ion, 1934 uffd_protect_section, 1935 (void *)(uintptr_t)uffd_fd); 1936 } 1937 return uffd_change_protection(uffd_fd, rb->host, 1938 rb->used_length, true, false); 1939 } 1940 1941 /* 1942 * ram_write_tracking_start: start UFFD-WP memory tracking 1943 * 1944 * Returns 0 for success or negative value in case of error 1945 */ 1946 int ram_write_tracking_start(void) 1947 { 1948 int uffd_fd; 1949 RAMState *rs = ram_state; 1950 RAMBlock *block; 1951 1952 /* Open UFFD file descriptor */ 1953 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1954 if (uffd_fd < 0) { 1955 return uffd_fd; 1956 } 1957 rs->uffdio_fd = uffd_fd; 1958 1959 RCU_READ_LOCK_GUARD(); 1960 1961 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1962 /* Nothing to do with read-only and MMIO-writable regions */ 1963 if (block->mr->readonly || block->mr->rom_device) { 1964 continue; 1965 } 1966 1967 /* Register block memory with UFFD to track writes */ 1968 if (uffd_register_memory(rs->uffdio_fd, block->host, 1969 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1970 goto fail; 1971 } 1972 block->flags |= RAM_UF_WRITEPROTECT; 1973 memory_region_ref(block->mr); 1974 1975 /* Apply 
UFFD write protection to the block memory range */ 1976 if (ram_block_uffd_protect(block, uffd_fd)) { 1977 goto fail; 1978 } 1979 1980 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1981 block->host, block->max_length); 1982 } 1983 1984 return 0; 1985 1986 fail: 1987 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1988 1989 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1990 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1991 continue; 1992 } 1993 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1994 /* Cleanup flags and remove reference */ 1995 block->flags &= ~RAM_UF_WRITEPROTECT; 1996 memory_region_unref(block->mr); 1997 } 1998 1999 uffd_close_fd(uffd_fd); 2000 rs->uffdio_fd = -1; 2001 return -1; 2002 } 2003 2004 /** 2005 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 2006 */ 2007 void ram_write_tracking_stop(void) 2008 { 2009 RAMState *rs = ram_state; 2010 RAMBlock *block; 2011 2012 RCU_READ_LOCK_GUARD(); 2013 2014 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2015 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2016 continue; 2017 } 2018 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2019 2020 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2021 block->host, block->max_length); 2022 2023 /* Cleanup flags and remove reference */ 2024 block->flags &= ~RAM_UF_WRITEPROTECT; 2025 memory_region_unref(block->mr); 2026 } 2027 2028 /* Finally close UFFD file descriptor */ 2029 uffd_close_fd(rs->uffdio_fd); 2030 rs->uffdio_fd = -1; 2031 } 2032 2033 #else 2034 /* No target OS support, stubs just fail or ignore */ 2035 2036 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2037 { 2038 (void) rs; 2039 (void) offset; 2040 2041 return NULL; 2042 } 2043 2044 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2045 unsigned long start_page) 2046 { 2047 (void) rs; 2048 (void) pss; 2049 (void) start_page; 2050 2051 return 0; 2052 } 2053 2054 bool ram_write_tracking_available(void) 2055 { 2056 return false; 2057 } 2058 2059 bool ram_write_tracking_compatible(void) 2060 { 2061 assert(0); 2062 return false; 2063 } 2064 2065 int ram_write_tracking_start(void) 2066 { 2067 assert(0); 2068 return -1; 2069 } 2070 2071 void ram_write_tracking_stop(void) 2072 { 2073 assert(0); 2074 } 2075 #endif /* defined(__linux__) */ 2076 2077 /** 2078 * get_queued_page: unqueue a page from the postcopy requests 2079 * 2080 * Skips pages that are already sent (!dirty) 2081 * 2082 * Returns true if a queued page is found 2083 * 2084 * @rs: current RAM state 2085 * @pss: data about the state of the current dirty page scan 2086 */ 2087 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2088 { 2089 RAMBlock *block; 2090 ram_addr_t offset; 2091 bool dirty; 2092 2093 do { 2094 block = unqueue_page(rs, &offset); 2095 /* 2096 * We're sending this page, and since it's postcopy nothing else 2097 * will dirty it, and we must make sure it doesn't get sent again 2098 * even if this queue request was received after the background 2099 * search already sent it. 
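 * For example: the destination urgently requests page P, but by the time
 * the request is unqueued here the background scan has already sent P and
 * cleared its bit in block->bmap; the test_bit() below then sees a clean
 * page and the stale request is dropped instead of being sent twice.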
2100 */ 2101 if (block) { 2102 unsigned long page; 2103 2104 page = offset >> TARGET_PAGE_BITS; 2105 dirty = test_bit(page, block->bmap);
2106 if (!dirty) { 2107 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2108 page); 2109 } else { 2110 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2111 } 2112 } 2113
2114 } while (block && !dirty); 2115 2116 if (!block) { 2117 /* 2118 * Poll write faults too if background snapshot is enabled; that's 2119 * when vCPUs can get blocked by write-protected pages. 2120 */ 2121 block = poll_fault_page(rs, &offset); 2122 } 2123
2124 if (block) { 2125 /* 2126 * We want the background search to continue from the queued page 2127 * since the guest is likely to want other pages near to the page 2128 * it just requested. 2129 */ 2130 pss->block = block; 2131 pss->page = offset >> TARGET_PAGE_BITS; 2132
2133 /* 2134 * This unqueued page would break the "one round" check, even if 2135 * it is really rare. 2136 */ 2137 pss->complete_round = false; 2138 } 2139 2140 return !!block; 2141 } 2142
2143 /** 2144 * migration_page_queue_free: drop any remaining pages in the ram 2145 * request queue 2146 * 2147 * It should be empty at the end anyway, but in error cases there may 2148 * be some left. In case any page is left, we drop it. 2149 * 2150 */
2151 static void migration_page_queue_free(RAMState *rs) 2152 { 2153 struct RAMSrcPageRequest *mspr, *next_mspr; 2154 /* This queue generally should be empty - but in the case of a failed 2155 * migration might have some droppings in. 2156 */ 2157 RCU_READ_LOCK_GUARD();
2158 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2159 memory_region_unref(mspr->rb->mr); 2160 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2161 g_free(mspr); 2162 } 2163 } 2164
2165 /** 2166 * ram_save_queue_pages: queue the page for transmission 2167 * 2168 * A request from the postcopy destination, for example. 2169 * 2170 * Returns zero on success or negative on error 2171 * 2172 * @rbname: Name of the RAMBlock of the request. NULL means the 2173 * same as the last one. 2174 * @start: starting address from the start of the RAMBlock 2175 * @len: length (in bytes) to send 2176 */
2177 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2178 { 2179 RAMBlock *ramblock; 2180 RAMState *rs = ram_state; 2181 2182 stat64_add(&mig_stats.postcopy_requests, 1); 2183 RCU_READ_LOCK_GUARD(); 2184
2185 if (!rbname) { 2186 /* Reuse last RAMBlock */ 2187 ramblock = rs->last_req_rb; 2188 2189 if (!ramblock) { 2190 /* 2191 * Shouldn't happen, we can't reuse the last RAMBlock if 2192 * it's the 1st request. 2193 */ 2194 error_report("ram_save_queue_pages no previous block"); 2195 return -1; 2196 }
2197 } else { 2198 ramblock = qemu_ram_block_by_name(rbname); 2199 2200 if (!ramblock) { 2201 /* We shouldn't be asked for a non-existent RAMBlock */ 2202 error_report("ram_save_queue_pages no block '%s'", rbname); 2203 return -1; 2204 } 2205 rs->last_req_rb = ramblock; 2206 }
2207 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2208 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2209 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2210 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2211 __func__, start, len, ramblock->used_length); 2212 return -1; 2213 } 2214
2215 /* 2216 * With postcopy preempt enabled, we send back the page directly in the 2217 * rp-return thread. 2218 */ 2219 if (postcopy_preempt_active()) { 2220 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2221 size_t page_size = qemu_ram_pagesize(ramblock); 2222 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2223 int ret = 0; 2224
2225 qemu_mutex_lock(&rs->bitmap_mutex); 2226 2227 pss_init(pss, ramblock, page_start); 2228 /* 2229 * Always use the preempt channel, and make sure it's there. It's 2230 * safe to access without lock, because when rp-thread is running 2231 * we should be the only one who operates on the qemufile 2232 */ 2233 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2234 assert(pss->pss_channel); 2235
2236 /* 2237 * It must be one host page or a multiple of the host page size. Just 2238 * assert; if something is wrong we're mostly split-brain anyway. 2239 */ 2240 assert(len % page_size == 0);
2241 while (len) { 2242 if (ram_save_host_page_urgent(pss)) { 2243 error_report("%s: ram_save_host_page_urgent() failed: " 2244 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2245 __func__, ramblock->idstr, start); 2246 ret = -1; 2247 break; 2248 }
2249 /* 2250 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2251 * will automatically be moved and point to the next host page 2252 * we're going to send, so no need to update here. 2253 * 2254 * Normally QEMU never sends >1 host page in requests, so 2255 * logically we don't even need the loop as it should only 2256 * run once; keep it for consistency. 2257 */ 2258 len -= page_size; 2259 }
2260 qemu_mutex_unlock(&rs->bitmap_mutex); 2261 2262 return ret; 2263 } 2264
2265 struct RAMSrcPageRequest *new_entry = 2266 g_new0(struct RAMSrcPageRequest, 1); 2267 new_entry->rb = ramblock; 2268 new_entry->offset = start; 2269 new_entry->len = len; 2270
2271 memory_region_ref(ramblock->mr); 2272 qemu_mutex_lock(&rs->src_page_req_mutex); 2273 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2274 migration_make_urgent_request(); 2275 qemu_mutex_unlock(&rs->src_page_req_mutex); 2276 2277 return 0; 2278 } 2279
2280 static bool save_page_use_compression(RAMState *rs) 2281 { 2282 if (!migrate_compress()) { 2283 return false; 2284 } 2285
2286 /* 2287 * If xbzrle is enabled (e.g., after the first round of migration), stop 2288 * using data compression. In theory, xbzrle can do better than 2289 * compression. 2290 */ 2291 if (rs->xbzrle_enabled) { 2292 return false; 2293 } 2294 2295 return true; 2296 } 2297
2298 /* 2299 * try to compress the page before posting it out, return true if the page 2300 * has been properly handled by compression, otherwise it needs other 2301 * paths to handle it 2302 */ 2303 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2304 RAMBlock *block, ram_addr_t offset) 2305 { 2306 if (!save_page_use_compression(rs)) { 2307 return false; 2308 } 2309
2310 /* 2311 * When starting the process of a new block, the first page of 2312 * the block should be sent out before other pages in the same 2313 * block, and all the pages in the last block should have been sent 2314 * out; keeping this order is important, because the 'cont' flag 2315 * is used to avoid resending the block name. 2316 * 2317 * We post the first page as a normal page as compression will take 2318 * much CPU resource.
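 * Example of why this ordering matters: pages sent with the
 * RAM_SAVE_FLAG_CONTINUE flag carry no block name and are attributed by
 * the receiver to the last announced block, so compressed pages of the
 * previous block that are still queued in the compress threads have to be
 * drained (flush_compressed_data() below) before the first page of a new
 * block goes out.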
2319 */ 2320 if (block != pss->last_sent_block) { 2321 flush_compressed_data(rs); 2322 return false; 2323 } 2324 2325 if (compress_page_with_multi_thread(block, offset) > 0) { 2326 return true; 2327 } 2328 2329 compression_counters.busy++; 2330 return false; 2331 } 2332 2333 /** 2334 * ram_save_target_page_legacy: save one target page 2335 * 2336 * Returns the number of pages written 2337 * 2338 * @rs: current RAM state 2339 * @pss: data about the page we want to send 2340 */ 2341 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2342 { 2343 RAMBlock *block = pss->block; 2344 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2345 int res; 2346 2347 if (control_save_page(pss, block, offset, &res)) { 2348 return res; 2349 } 2350 2351 if (save_compress_page(rs, pss, block, offset)) { 2352 return 1; 2353 } 2354 2355 res = save_zero_page(pss, pss->pss_channel, block, offset); 2356 if (res > 0) { 2357 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2358 * page would be stale 2359 */ 2360 if (rs->xbzrle_enabled) { 2361 XBZRLE_cache_lock(); 2362 xbzrle_cache_zero_page(rs, block->offset + offset); 2363 XBZRLE_cache_unlock(); 2364 } 2365 return res; 2366 } 2367 2368 /* 2369 * Do not use multifd in postcopy as one whole host page should be 2370 * placed. Meanwhile postcopy requires atomic update of pages, so even 2371 * if host page size == guest page size the dest guest during run may 2372 * still see partially copied pages which is data corruption. 2373 */ 2374 if (migrate_multifd() && !migration_in_postcopy()) { 2375 return ram_save_multifd_page(pss->pss_channel, block, offset); 2376 } 2377 2378 return ram_save_page(rs, pss); 2379 } 2380 2381 /* Should be called before sending a host page */ 2382 static void pss_host_page_prepare(PageSearchStatus *pss) 2383 { 2384 /* How many guest pages are there in one host page? */ 2385 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2386 2387 pss->host_page_sending = true; 2388 if (guest_pfns <= 1) { 2389 /* 2390 * This covers both when guest psize == host psize, or when guest 2391 * has larger psize than the host (guest_pfns==0). 2392 * 2393 * For the latter, we always send one whole guest page per 2394 * iteration of the host page (example: an Alpha VM on x86 host 2395 * will have guest psize 8K while host psize 4K). 2396 */ 2397 pss->host_page_start = pss->page; 2398 pss->host_page_end = pss->page + 1; 2399 } else { 2400 /* 2401 * The host page spans over multiple guest pages, we send them 2402 * within the same host page iteration. 2403 */ 2404 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2405 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2406 } 2407 } 2408 2409 /* 2410 * Whether the page pointed by PSS is within the host page being sent. 2411 * Must be called after a previous pss_host_page_prepare(). 2412 */ 2413 static bool pss_within_range(PageSearchStatus *pss) 2414 { 2415 ram_addr_t ram_addr; 2416 2417 assert(pss->host_page_sending); 2418 2419 /* Over host-page boundary? 
*/ 2420 if (pss->page >= pss->host_page_end) { 2421 return false; 2422 } 2423 2424 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2425 2426 return offset_in_ramblock(pss->block, ram_addr); 2427 } 2428
2429 static void pss_host_page_finish(PageSearchStatus *pss) 2430 { 2431 pss->host_page_sending = false; 2432 /* This is not needed, but just to reset it */ 2433 pss->host_page_start = pss->host_page_end = 0; 2434 } 2435
2436 /* 2437 * Send an urgent host page specified by `pss'. Needs to be called with 2438 * bitmap_mutex held. 2439 * 2440 * Returns 0 if saving the host page succeeded, negative value otherwise. 2441 */
2442 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2443 { 2444 bool page_dirty, sent = false; 2445 RAMState *rs = ram_state; 2446 int ret = 0; 2447
2448 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2449 pss_host_page_prepare(pss); 2450
2451 /* 2452 * If precopy is sending the same page, let it be done in precopy, or 2453 * we could send the same page in two channels and neither of them will 2454 * receive the whole page. 2455 */ 2456 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2457 trace_postcopy_preempt_hit(pss->block->idstr, 2458 pss->page << TARGET_PAGE_BITS); 2459 return 0; 2460 } 2461
2462 do { 2463 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2464 2465 if (page_dirty) { 2466 /* Be strict about the return code; it must be 1 here, nothing else */ 2467 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2468 error_report_once("%s: ram_save_target_page failed", __func__); 2469 ret = -1; 2470 goto out; 2471 } 2472 sent = true; 2473 }
2474 pss_find_next_dirty(pss); 2475 } while (pss_within_range(pss)); 2476 out: 2477 pss_host_page_finish(pss); 2478 /* For urgent requests, flush immediately if sent */ 2479 if (sent) { 2480 qemu_fflush(pss->pss_channel); 2481 } 2482 return ret; 2483 } 2484
2485 /** 2486 * ram_save_host_page: save a whole host page 2487 * 2488 * Starting at *offset send pages up to the end of the current host 2489 * page. It's valid for the initial offset to point into the middle of 2490 * a host page in which case the remainder of the host page is sent. 2491 * Only dirty target pages are sent. Note that the host page size may 2492 * be a huge page for this block. 2493 * 2494 * The saving stops at the boundary of the used_length of the block 2495 * if the RAMBlock isn't a multiple of the host page size. 2496 * 2497 * The caller must hold ram_state.bitmap_mutex to call this 2498 * function. Note that this function can temporarily release the lock, but 2499 * it makes sure the lock is held again before it returns.
2500 * 2501 * Returns the number of pages written or negative on error 2502 * 2503 * @rs: current RAM state 2504 * @pss: data about the page we want to send 2505 */
2506 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2507 { 2508 bool page_dirty, preempt_active = postcopy_preempt_active(); 2509 int tmppages, pages = 0; 2510 size_t pagesize_bits = 2511 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2512 unsigned long start_page = pss->page; 2513 int res; 2514
2515 if (ramblock_is_ignored(pss->block)) { 2516 error_report("block %s should not be migrated !", pss->block->idstr); 2517 return 0; 2518 } 2519
2520 /* Update host page boundary information */ 2521 pss_host_page_prepare(pss); 2522
2523 do { 2524 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2525 2526 /* Check if the page is dirty and, if so, send it */ 2527 if (page_dirty) { 2528 /* 2529 * Properly yield the lock only in postcopy preempt mode 2530 * because both migration thread and rp-return thread can 2531 * operate on the bitmaps. 2532 */ 2533 if (preempt_active) { 2534 qemu_mutex_unlock(&rs->bitmap_mutex); 2535 }
2536 tmppages = migration_ops->ram_save_target_page(rs, pss); 2537 if (tmppages >= 0) { 2538 pages += tmppages; 2539 /* 2540 * Allow rate limiting to happen in the middle of huge pages if 2541 * something is sent in the current iteration. 2542 */ 2543 if (pagesize_bits > 1 && tmppages > 0) { 2544 migration_rate_limit(); 2545 } 2546 }
2547 if (preempt_active) { 2548 qemu_mutex_lock(&rs->bitmap_mutex); 2549 } 2550 } else { 2551 tmppages = 0; 2552 } 2553
2554 if (tmppages < 0) { 2555 pss_host_page_finish(pss); 2556 return tmppages; 2557 } 2558 2559 pss_find_next_dirty(pss); 2560 } while (pss_within_range(pss)); 2561 2562 pss_host_page_finish(pss); 2563
2564 res = ram_save_release_protection(rs, pss, start_page); 2565 return (res < 0 ? res : pages); 2566 } 2567
2568 /** 2569 * ram_find_and_save_block: finds a dirty page and sends it to f 2570 * 2571 * Called within an RCU critical section. 2572 * 2573 * Returns the number of pages written where zero means no dirty pages, 2574 * or negative on error 2575 * 2576 * @rs: current RAM state 2577 * 2578 * On systems where host-page-size > target-page-size it will send all the 2579 * pages in a host page that are dirty. 2580 */
2581 static int ram_find_and_save_block(RAMState *rs) 2582 { 2583 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2584 int pages = 0; 2585
2586 /* No dirty page as there is zero RAM */ 2587 if (!rs->ram_bytes_total) { 2588 return pages; 2589 } 2590
2591 /* 2592 * Always keep last_seen_block/last_page valid during this procedure, 2593 * because find_dirty_block() relies on these values (e.g., we compare 2594 * last_seen_block with pss.block to see whether we searched all the 2595 * ramblocks) to detect the completion of migration. Having a NULL 2596 * last_seen_block can conditionally cause the below loop to run forever.
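 * (Concretely: find_dirty_block() only detects a completed round once
 * pss->block compares equal to rs->last_seen_block after wrapping around;
 * with a NULL last_seen_block that comparison could never succeed.)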
2597 */ 2598 if (!rs->last_seen_block) { 2599 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2600 rs->last_page = 0; 2601 } 2602
2603 pss_init(pss, rs->last_seen_block, rs->last_page); 2604 2605 while (true) { 2606 if (!get_queued_page(rs, pss)) { 2607 /* priority queue empty, so just search for something dirty */ 2608 int res = find_dirty_block(rs, pss); 2609 if (res != PAGE_DIRTY_FOUND) { 2610 if (res == PAGE_ALL_CLEAN) { 2611 break; 2612 } else if (res == PAGE_TRY_AGAIN) { 2613 continue; 2614 } else if (res < 0) { 2615 pages = res; 2616 break; 2617 } 2618 } 2619 }
2620 pages = ram_save_host_page(rs, pss); 2621 if (pages) { 2622 break; 2623 } 2624 } 2625
2626 rs->last_seen_block = pss->block; 2627 rs->last_page = pss->page; 2628 2629 return pages; 2630 } 2631
2632 static uint64_t ram_bytes_total_with_ignored(void) 2633 { 2634 RAMBlock *block; 2635 uint64_t total = 0; 2636 2637 RCU_READ_LOCK_GUARD(); 2638 2639 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2640 total += block->used_length; 2641 } 2642 return total; 2643 } 2644
2645 uint64_t ram_bytes_total(void) 2646 { 2647 RAMBlock *block; 2648 uint64_t total = 0; 2649 2650 RCU_READ_LOCK_GUARD(); 2651 2652 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2653 total += block->used_length; 2654 } 2655 return total; 2656 } 2657
2658 static void xbzrle_load_setup(void) 2659 { 2660 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2661 } 2662
2663 static void xbzrle_load_cleanup(void) 2664 { 2665 g_free(XBZRLE.decoded_buf); 2666 XBZRLE.decoded_buf = NULL; 2667 } 2668
2669 static void ram_state_cleanup(RAMState **rsp) 2670 { 2671 if (*rsp) { 2672 migration_page_queue_free(*rsp); 2673 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2674 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2675 g_free(*rsp); 2676 *rsp = NULL; 2677 } 2678 } 2679
2680 static void xbzrle_cleanup(void) 2681 { 2682 XBZRLE_cache_lock(); 2683 if (XBZRLE.cache) { 2684 cache_fini(XBZRLE.cache); 2685 g_free(XBZRLE.encoded_buf); 2686 g_free(XBZRLE.current_buf); 2687 g_free(XBZRLE.zero_target_page); 2688 XBZRLE.cache = NULL; 2689 XBZRLE.encoded_buf = NULL; 2690 XBZRLE.current_buf = NULL; 2691 XBZRLE.zero_target_page = NULL; 2692 } 2693 XBZRLE_cache_unlock(); 2694 } 2695
2696 static void ram_save_cleanup(void *opaque) 2697 { 2698 RAMState **rsp = opaque; 2699 RAMBlock *block; 2700
2701 /* We don't use dirty log with background snapshots */ 2702 if (!migrate_background_snapshot()) { 2703 /* the caller holds the iothread lock or is in a bh, so there is 2704 * no writing race against the migration bitmap 2705 */ 2706 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2707 /* 2708 * do not stop dirty log without starting it, since 2709 * memory_global_dirty_log_stop will assert that 2710 * memory_global_dirty_log_start/stop are used in pairs 2711 */ 2712 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2713 } 2714 } 2715
2716 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2717 g_free(block->clear_bmap); 2718 block->clear_bmap = NULL; 2719 g_free(block->bmap); 2720 block->bmap = NULL; 2721 } 2722
2723 xbzrle_cleanup(); 2724 compress_threads_save_cleanup(); 2725 ram_state_cleanup(rsp); 2726 g_free(migration_ops); 2727 migration_ops = NULL; 2728 } 2729
2730 static void ram_state_reset(RAMState *rs) 2731 { 2732 int i; 2733 2734 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2735 rs->pss[i].last_sent_block = NULL; 2736 } 2737
2738 rs->last_seen_block = NULL; 2739 rs->last_page = 0; 2740 rs->last_version = ram_list.version; 2741 rs->xbzrle_enabled = false; 2742 } 2743 2744 #define MAX_WAIT 50 /* ms, half buffered_file
limit */ 2745 2746 /* **** functions for postcopy ***** */ 2747
2748 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2749 { 2750 struct RAMBlock *block; 2751
2752 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2753 unsigned long *bitmap = block->bmap; 2754 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2755 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2756
2757 while (run_start < range) { 2758 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2759 ram_discard_range(block->idstr, 2760 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2761 ((ram_addr_t)(run_end - run_start)) 2762 << TARGET_PAGE_BITS); 2763 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2764 } 2765 } 2766 } 2767
2768 /** 2769 * postcopy_send_discard_bm_ram: discard a RAMBlock 2770 * 2771 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2772 * 2773 * @ms: current migration state 2774 * @block: RAMBlock to discard 2775 */
2776 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2777 { 2778 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2779 unsigned long current; 2780 unsigned long *bitmap = block->bmap; 2781
2782 for (current = 0; current < end; ) { 2783 unsigned long one = find_next_bit(bitmap, end, current); 2784 unsigned long zero, discard_length; 2785 2786 if (one >= end) { 2787 break; 2788 } 2789
2790 zero = find_next_zero_bit(bitmap, end, one + 1); 2791 2792 if (zero >= end) { 2793 discard_length = end - one; 2794 } else { 2795 discard_length = zero - one; 2796 } 2797 postcopy_discard_send_range(ms, one, discard_length); 2798 current = one + discard_length; 2799 } 2800 } 2801
2802 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2803
2804 /** 2805 * postcopy_each_ram_send_discard: discard all RAMBlocks 2806 * 2807 * Utility for the outgoing postcopy code. 2808 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2809 * passing it bitmap indexes and name. 2810 * (qemu_ram_foreach_block ends up passing unscaled lengths 2811 * which would mean postcopy code would have to deal with target page) 2812 * 2813 * @ms: current migration state 2814 */
2815 static void postcopy_each_ram_send_discard(MigrationState *ms) 2816 { 2817 struct RAMBlock *block; 2818 2819 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2820 postcopy_discard_send_init(ms, block->idstr); 2821
2822 /* 2823 * Deal with TPS != HPS and huge pages. It discards any partially sent 2824 * host-page size chunks and marks any partially dirty host-page size 2825 * chunks as all dirty. In this case the host-page is the host-page 2826 * for the particular RAMBlock, i.e. it might be a huge page. 2827 */ 2828 postcopy_chunk_hostpages_pass(ms, block); 2829
2830 /* 2831 * Postcopy sends chunks of bitmap over the wire, but it 2832 * just needs indexes at this point; this avoids it having 2833 * target-page-specific code. 2834 */ 2835 postcopy_send_discard_bm_ram(ms, block); 2836 postcopy_discard_send_finish(ms); 2837 } 2838 } 2839
2840 /** 2841 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2842 * 2843 * Helper for postcopy_chunk_hostpages; it's called twice to 2844 * canonicalize the two bitmaps, which are similar, but one is 2845 * inverted. 2846 * 2847 * Postcopy requires that all target pages in a hostpage are dirty or 2848 * clean, not a mix. This function canonicalizes the bitmaps.
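 * Illustrative example (hypothetical numbers): with 2MiB host pages and
 * 4KiB target pages, host_ratio is 512. A dirty run covering only target
 * pages 3..700 straddles host pages 0 (targets 0..511) and 1 (targets
 * 512..1023); this pass re-marks targets 0..1023 dirty so that each host
 * page is later sent or discarded as a whole.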
2849 * 2850 * @ms: current migration state 2851 * @block: block that contains the page we want to canonicalize 2852 */
2853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2854 { 2855 RAMState *rs = ram_state; 2856 unsigned long *bitmap = block->bmap; 2857 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2858 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2859 unsigned long run_start; 2860
2861 if (block->page_size == TARGET_PAGE_SIZE) { 2862 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2863 return; 2864 } 2865
2866 /* Find a dirty page */ 2867 run_start = find_next_bit(bitmap, pages, 0); 2868 2869 while (run_start < pages) { 2870
2871 /* 2872 * If the start of this run of pages is in the middle of a host 2873 * page, then we need to fix up this host page. 2874 */ 2875 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2876 /* Find the end of this run */ 2877 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2878 /* 2879 * If the end isn't at the start of a host page, then the 2880 * run doesn't finish at the end of a host page 2881 * and we need to discard. 2882 */ 2883 } 2884
2885 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2886 unsigned long page; 2887 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2888 host_ratio); 2889 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2890
2891 /* Clean up the bitmap */ 2892 for (page = fixup_start_addr; 2893 page < fixup_start_addr + host_ratio; page++) { 2894 /* 2895 * Remark them as dirty, updating the count for any pages 2896 * that weren't previously dirty. 2897 */ 2898 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2899 } 2900 } 2901
2902 /* Find the next dirty page for the next iteration */ 2903 run_start = find_next_bit(bitmap, pages, run_start); 2904 } 2905 } 2906
2907 /** 2908 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2909 * 2910 * Transmit the set of pages to be discarded after precopy to the target; 2911 * these are pages that: 2912 * a) Have been previously transmitted but are now dirty again 2913 * b) Have never been transmitted; this ensures that 2914 * any pages on the destination that have been mapped by background 2915 * tasks get discarded (transparent huge pages are the specific concern) 2916 * Hopefully this is pretty sparse 2917 * 2918 * @ms: current migration state 2919 */
2920 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2921 { 2922 RAMState *rs = ram_state; 2923 2924 RCU_READ_LOCK_GUARD(); 2925
2926 /* This should be our last sync, the src is now paused */ 2927 migration_bitmap_sync(rs); 2928
2929 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2930 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2931 rs->last_seen_block = NULL; 2932 rs->last_page = 0; 2933
2934 postcopy_each_ram_send_discard(ms); 2935 2936 trace_ram_postcopy_send_discard_bitmap(); 2937 } 2938
2939 /** 2940 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2941 * 2942 * Returns zero on success 2943 * 2944 * @rbname: name of the RAMBlock of the request. NULL means the 2945 * same as the last one.
2946 * @start: RAMBlock starting page 2947 * @length: RAMBlock size 2948 */ 2949 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2950 { 2951 trace_ram_discard_range(rbname, start, length); 2952 2953 RCU_READ_LOCK_GUARD(); 2954 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2955 2956 if (!rb) { 2957 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2958 return -1; 2959 } 2960 2961 /* 2962 * On source VM, we don't need to update the received bitmap since 2963 * we don't even have one. 2964 */ 2965 if (rb->receivedmap) { 2966 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2967 length >> qemu_target_page_bits()); 2968 } 2969 2970 return ram_block_discard_range(rb, start, length); 2971 } 2972 2973 /* 2974 * For every allocation, we will try not to crash the VM if the 2975 * allocation failed. 2976 */ 2977 static int xbzrle_init(void) 2978 { 2979 Error *local_err = NULL; 2980 2981 if (!migrate_xbzrle()) { 2982 return 0; 2983 } 2984 2985 XBZRLE_cache_lock(); 2986 2987 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2988 if (!XBZRLE.zero_target_page) { 2989 error_report("%s: Error allocating zero page", __func__); 2990 goto err_out; 2991 } 2992 2993 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2994 TARGET_PAGE_SIZE, &local_err); 2995 if (!XBZRLE.cache) { 2996 error_report_err(local_err); 2997 goto free_zero_page; 2998 } 2999 3000 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3001 if (!XBZRLE.encoded_buf) { 3002 error_report("%s: Error allocating encoded_buf", __func__); 3003 goto free_cache; 3004 } 3005 3006 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3007 if (!XBZRLE.current_buf) { 3008 error_report("%s: Error allocating current_buf", __func__); 3009 goto free_encoded_buf; 3010 } 3011 3012 /* We are all good */ 3013 XBZRLE_cache_unlock(); 3014 return 0; 3015 3016 free_encoded_buf: 3017 g_free(XBZRLE.encoded_buf); 3018 XBZRLE.encoded_buf = NULL; 3019 free_cache: 3020 cache_fini(XBZRLE.cache); 3021 XBZRLE.cache = NULL; 3022 free_zero_page: 3023 g_free(XBZRLE.zero_target_page); 3024 XBZRLE.zero_target_page = NULL; 3025 err_out: 3026 XBZRLE_cache_unlock(); 3027 return -ENOMEM; 3028 } 3029 3030 static int ram_state_init(RAMState **rsp) 3031 { 3032 *rsp = g_try_new0(RAMState, 1); 3033 3034 if (!*rsp) { 3035 error_report("%s: Init ramstate fail", __func__); 3036 return -1; 3037 } 3038 3039 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3040 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3041 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3042 (*rsp)->ram_bytes_total = ram_bytes_total(); 3043 3044 /* 3045 * Count the total number of pages used by ram blocks not including any 3046 * gaps due to alignment or unplugs. 3047 * This must match with the initial values of dirty bitmap. 
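 * Example: a guest with 4GiB of RAM and 4KiB target pages starts out with
 * migration_dirty_pages == 1048576, matching the all-ones bitmap installed
 * by ram_list_init_bitmaps().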
3048 */ 3049 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3050 ram_state_reset(*rsp); 3051 3052 return 0; 3053 } 3054
3055 static void ram_list_init_bitmaps(void) 3056 { 3057 MigrationState *ms = migrate_get_current(); 3058 RAMBlock *block; 3059 unsigned long pages; 3060 uint8_t shift; 3061
3062 /* Skip setting bitmap if there is no RAM */ 3063 if (ram_bytes_total()) { 3064 shift = ms->clear_bitmap_shift; 3065 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3066 error_report("clear_bitmap_shift (%u) too big, using " 3067 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3068 shift = CLEAR_BITMAP_SHIFT_MAX; 3069 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3070 error_report("clear_bitmap_shift (%u) too small, using " 3071 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3072 shift = CLEAR_BITMAP_SHIFT_MIN; 3073 } 3074
3075 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3076 pages = block->max_length >> TARGET_PAGE_BITS; 3077 /* 3078 * The initial dirty bitmap for migration must be set with all 3079 * ones to make sure we'll migrate every guest RAM page to 3080 * destination. 3081 * Here we set RAMBlock.bmap all to 1 because when we restart a 3082 * new migration after a failed one, ram_list. 3083 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole 3084 * guest memory. 3085 */ 3086 block->bmap = bitmap_new(pages); 3087 bitmap_set(block->bmap, 0, pages); 3088 block->clear_bmap_shift = shift; 3089 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3090 } 3091 } 3092 } 3093
3094 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3095 { 3096 unsigned long pages; 3097 RAMBlock *rb; 3098 3099 RCU_READ_LOCK_GUARD(); 3100
3101 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3102 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3103 rs->migration_dirty_pages -= pages; 3104 } 3105 } 3106
3107 static void ram_init_bitmaps(RAMState *rs) 3108 { 3109 /* For memory_global_dirty_log_start below. */ 3110 qemu_mutex_lock_iothread(); 3111 qemu_mutex_lock_ramlist(); 3112
3113 WITH_RCU_READ_LOCK_GUARD() { 3114 ram_list_init_bitmaps(); 3115 /* We don't use dirty log with background snapshots */ 3116 if (!migrate_background_snapshot()) { 3117 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3118 migration_bitmap_sync_precopy(rs); 3119 } 3120 }
3121 qemu_mutex_unlock_ramlist(); 3122 qemu_mutex_unlock_iothread(); 3123
3124 /* 3125 * After an eventual first bitmap sync, fixup the initial bitmap 3126 * containing all 1s to exclude any discarded pages from migration. 3127 */ 3128 migration_bitmap_clear_discarded_pages(rs); 3129 } 3130
3131 static int ram_init_all(RAMState **rsp) 3132 { 3133 if (ram_state_init(rsp)) { 3134 return -1; 3135 } 3136 3137 if (xbzrle_init()) { 3138 ram_state_cleanup(rsp); 3139 return -1; 3140 } 3141 3142 ram_init_bitmaps(*rsp); 3143 3144 return 0; 3145 } 3146
3147 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3148 { 3149 RAMBlock *block; 3150 uint64_t pages = 0; 3151
3152 /* 3153 * Postcopy is not using xbzrle/compression, so no need for that. 3154 * Also, since the source is already halted, we don't need to care 3155 * about dirty page logging either. 3156 */ 3157
3158 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3159 pages += bitmap_count_one(block->bmap, 3160 block->used_length >> TARGET_PAGE_BITS); 3161 } 3162
3163 /* This may not be aligned with current bitmaps. Recalculate. */ 3164 rs->migration_dirty_pages = pages; 3165 3166 ram_state_reset(rs); 3167
3168 /* Update RAMState cache of output QEMUFile */ 3169 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3170 3171 trace_ram_state_resume_prepare(pages); 3172 } 3173
3174 /* 3175 * This function clears bits of the free pages reported by the caller from the 3176 * migration dirty bitmap. @addr is the host address corresponding to the 3177 * start of the continuous guest free pages, and @len is the total bytes of 3178 * those pages. 3179 */
3180 void qemu_guest_free_page_hint(void *addr, size_t len) 3181 { 3182 RAMBlock *block; 3183 ram_addr_t offset; 3184 size_t used_len, start, npages; 3185 MigrationState *s = migrate_get_current(); 3186
3187 /* This function is currently expected to be used during live migration */ 3188 if (!migration_is_setup_or_active(s->state)) { 3189 return; 3190 } 3191
3192 for (; len > 0; len -= used_len, addr += used_len) { 3193 block = qemu_ram_block_from_host(addr, false, &offset); 3194 if (unlikely(!block || offset >= block->used_length)) { 3195 /* 3196 * The implementation might not support RAMBlock resize during 3197 * live migration, but it could happen in theory with future 3198 * updates. So we add a check here to capture that case. 3199 */ 3200 error_report_once("%s unexpected error", __func__); 3201 return; 3202 } 3203
3204 if (len <= block->used_length - offset) { 3205 used_len = len; 3206 } else { 3207 used_len = block->used_length - offset; 3208 } 3209
3210 start = offset >> TARGET_PAGE_BITS; 3211 npages = used_len >> TARGET_PAGE_BITS; 3212
3213 qemu_mutex_lock(&ram_state->bitmap_mutex); 3214 /* 3215 * The skipped free pages are equivalent to having been sent, from clear_bmap's 3216 * perspective, so clear the bits from the memory region bitmap which 3217 * are initially set. Otherwise those skipped pages will be sent in 3218 * the next round after syncing from the memory region bitmap. 3219 */ 3220 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3221 ram_state->migration_dirty_pages -= 3222 bitmap_count_one_with_offset(block->bmap, start, npages); 3223 bitmap_clear(block->bmap, start, npages); 3224 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3225 } 3226 } 3227
3228 /* 3229 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a 3230 * long-running RCU critical section. When rcu-reclaims in the code 3231 * start to become numerous it will be necessary to reduce the 3232 * granularity of these critical sections. 3233 */ 3234
3235 /** 3236 * ram_save_setup: Setup RAM for migration 3237 * 3238 * Returns zero to indicate success and negative for error 3239 * 3240 * @f: QEMUFile where to send the data 3241 * @opaque: RAMState pointer 3242 */
3243 static int ram_save_setup(QEMUFile *f, void *opaque) 3244 { 3245 RAMState **rsp = opaque; 3246 RAMBlock *block; 3247 int ret; 3248
3249 if (compress_threads_save_setup()) { 3250 return -1; 3251 } 3252 3253 /* migration has already set up the bitmap, reuse it.
*/ 3254 if (!migration_in_colo_state()) { 3255 if (ram_init_all(rsp) != 0) { 3256 compress_threads_save_cleanup(); 3257 return -1; 3258 } 3259 } 3260 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3261
3262 WITH_RCU_READ_LOCK_GUARD() { 3263 qemu_put_be64(f, ram_bytes_total_with_ignored() 3264 | RAM_SAVE_FLAG_MEM_SIZE); 3265
3266 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3267 qemu_put_byte(f, strlen(block->idstr)); 3268 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3269 qemu_put_be64(f, block->used_length); 3270 if (migrate_postcopy_ram() && block->page_size != 3271 qemu_host_page_size) { 3272 qemu_put_be64(f, block->page_size); 3273 } 3274 if (migrate_ignore_shared()) { 3275 qemu_put_be64(f, block->mr->addr); 3276 } 3277 } 3278 } 3279
3280 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3281 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3282
3283 migration_ops = g_malloc0(sizeof(MigrationOps)); 3284 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3285 ret = multifd_send_sync_main(f); 3286 if (ret < 0) { 3287 return ret; 3288 } 3289
3290 if (!migrate_multifd_flush_after_each_section()) { 3291 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3292 } 3293 3294 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3295 qemu_fflush(f); 3296 3297 return 0; 3298 } 3299
3300 /** 3301 * ram_save_iterate: iterative stage for migration 3302 * 3303 * Returns zero to indicate success and negative for error 3304 * 3305 * @f: QEMUFile where to send the data 3306 * @opaque: RAMState pointer 3307 */
3308 static int ram_save_iterate(QEMUFile *f, void *opaque) 3309 { 3310 RAMState **temp = opaque; 3311 RAMState *rs = *temp; 3312 int ret = 0; 3313 int i; 3314 int64_t t0; 3315 int done = 0; 3316
3317 if (blk_mig_bulk_active()) { 3318 /* Avoid transferring ram during bulk phase of block migration as 3319 * the bulk phase will usually take a long time and transferring 3320 * ram updates during that time is pointless. */ 3321 goto out; 3322 } 3323
3324 /* 3325 * We'll take this lock a little bit long, but it's okay for two reasons. 3326 * Firstly, the only other thread that may take it is the one that calls 3327 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3328 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3329 * guarantees that we'll release it on a regular basis at least. 3330 */ 3331 qemu_mutex_lock(&rs->bitmap_mutex);
3332 WITH_RCU_READ_LOCK_GUARD() { 3333 if (ram_list.version != rs->last_version) { 3334 ram_state_reset(rs); 3335 } 3336
3337 /* Read version before ram_list.blocks */ 3338 smp_rmb(); 3339 3340 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3341
3342 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3343 i = 0; 3344 while ((ret = qemu_file_rate_limit(f)) == 0 || 3345 postcopy_has_request(rs)) { 3346 int pages; 3347
3348 if (qemu_file_get_error(f)) { 3349 break; 3350 } 3351
3352 pages = ram_find_and_save_block(rs); 3353 /* no more pages to send */ 3354 if (pages == 0) { 3355 done = 1; 3356 break; 3357 } 3358
3359 if (pages < 0) { 3360 qemu_file_set_error(f, pages); 3361 break; 3362 } 3363 3364 rs->target_page_count += pages; 3365
3366 /* 3367 * During postcopy, it is necessary to make sure one whole host 3368 * page is sent in one chunk. 3369 */ 3370 if (migrate_postcopy_ram()) { 3371 flush_compressed_data(rs); 3372 } 3373
3374 /* 3375 * we want to check in the 1st loop, just in case it was the 1st 3376 * time and we had to sync the dirty bitmap. 3377 * qemu_clock_get_ns() is a bit expensive, so we only check once 3378 * every few iterations 3379 */ 3380 if ((i & 63) == 0) { 3381 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3382 1000000; 3383 if (t1 > MAX_WAIT) { 3384 trace_ram_save_iterate_big_wait(t1, i); 3385 break; 3386 } 3387 } 3388 i++; 3389 } 3390 }
3391 qemu_mutex_unlock(&rs->bitmap_mutex); 3392
3393 /* 3394 * Must occur before EOS (or any QEMUFile operation) 3395 * because of RDMA protocol. 3396 */ 3397 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3398
3399 out: 3400 if (ret >= 0 3401 && migration_is_setup_or_active(migrate_get_current()->state)) { 3402 if (migrate_multifd_flush_after_each_section()) { 3403 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3404 if (ret < 0) { 3405 return ret; 3406 } 3407 } 3408
3409 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3410 qemu_fflush(f); 3411 ram_transferred_add(8); 3412 3413 ret = qemu_file_get_error(f); 3414 }
3415 if (ret < 0) { 3416 return ret; 3417 } 3418 3419 return done; 3420 } 3421
3422 /** 3423 * ram_save_complete: function called to send the remaining amount of RAM 3424 * 3425 * Returns zero to indicate success or negative on error 3426 * 3427 * Called with the iothread lock held 3428 * 3429 * @f: QEMUFile where to send the data 3430 * @opaque: RAMState pointer 3431 */
3432 static int ram_save_complete(QEMUFile *f, void *opaque) 3433 { 3434 RAMState **temp = opaque; 3435 RAMState *rs = *temp; 3436 int ret = 0; 3437
3438 rs->last_stage = !migration_in_colo_state(); 3439 3440 WITH_RCU_READ_LOCK_GUARD() { 3441 if (!migration_in_postcopy()) { 3442 migration_bitmap_sync_precopy(rs); 3443 } 3444
3445 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3446 3447 /* try transferring iterative blocks of memory */ 3448
3449 /* flush all remaining blocks regardless of rate limiting */ 3450 qemu_mutex_lock(&rs->bitmap_mutex); 3451 while (true) { 3452 int pages; 3453
3454 pages = ram_find_and_save_block(rs); 3455 /* no more blocks to send */ 3456 if (pages == 0) { 3457 break; 3458 } 3459 if (pages < 0) { 3460 ret = pages; 3461 break; 3462 } 3463 } 3464 qemu_mutex_unlock(&rs->bitmap_mutex); 3465
3466 flush_compressed_data(rs); 3467 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3468 } 3469
3470 if (ret < 0) { 3471 return ret; 3472 } 3473
3474 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3475 if (ret < 0) { 3476 return ret; 3477 } 3478
3479 if (!migrate_multifd_flush_after_each_section()) { 3480 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3481 } 3482 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3483 qemu_fflush(f); 3484 3485 return 0; 3486 } 3487
3488 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3489 uint64_t *can_postcopy) 3490 { 3491 RAMState **temp = opaque; 3492 RAMState *rs = *temp; 3493
3494 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3495
3496 if (migrate_postcopy_ram()) { 3497 /* We can do postcopy, and all the data is postcopiable */ 3498 *can_postcopy += remaining_size; 3499 } else { 3500 *must_precopy += remaining_size; 3501 } 3502 } 3503
3504 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3505 uint64_t *can_postcopy) 3506 { 3507 MigrationState *s = migrate_get_current(); 3508 RAMState **temp = opaque; 3509 RAMState *rs = *temp; 3510
3511 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3512
3513 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3514 qemu_mutex_lock_iothread(); 3515
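/* Sync the dirty bitmap under the iothread lock (and RCU below) so that the "exact" figure reported to the caller reflects all dirtying up to this point. */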
WITH_RCU_READ_LOCK_GUARD() { 3516 migration_bitmap_sync_precopy(rs); 3517 } 3518 qemu_mutex_unlock_iothread(); 3519 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3520 } 3521 3522 if (migrate_postcopy_ram()) { 3523 /* We can do postcopy, and all the data is postcopiable */ 3524 *can_postcopy += remaining_size; 3525 } else { 3526 *must_precopy += remaining_size; 3527 } 3528 } 3529 3530 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3531 { 3532 unsigned int xh_len; 3533 int xh_flags; 3534 uint8_t *loaded_data; 3535 3536 /* extract RLE header */ 3537 xh_flags = qemu_get_byte(f); 3538 xh_len = qemu_get_be16(f); 3539 3540 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3541 error_report("Failed to load XBZRLE page - wrong compression!"); 3542 return -1; 3543 } 3544 3545 if (xh_len > TARGET_PAGE_SIZE) { 3546 error_report("Failed to load XBZRLE page - len overflow!"); 3547 return -1; 3548 } 3549 loaded_data = XBZRLE.decoded_buf; 3550 /* load data and decode */ 3551 /* it can change loaded_data to point to an internal buffer */ 3552 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3553 3554 /* decode RLE */ 3555 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3556 TARGET_PAGE_SIZE) == -1) { 3557 error_report("Failed to load XBZRLE page - decode error!"); 3558 return -1; 3559 } 3560 3561 return 0; 3562 } 3563 3564 /** 3565 * ram_block_from_stream: read a RAMBlock id from the migration stream 3566 * 3567 * Must be called from within a rcu critical section. 3568 * 3569 * Returns a pointer from within the RCU-protected ram_list. 3570 * 3571 * @mis: the migration incoming state pointer 3572 * @f: QEMUFile where to read the data from 3573 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3574 * @channel: the channel we're using 3575 */ 3576 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3577 QEMUFile *f, int flags, 3578 int channel) 3579 { 3580 RAMBlock *block = mis->last_recv_block[channel]; 3581 char id[256]; 3582 uint8_t len; 3583 3584 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3585 if (!block) { 3586 error_report("Ack, bad migration stream!"); 3587 return NULL; 3588 } 3589 return block; 3590 } 3591 3592 len = qemu_get_byte(f); 3593 qemu_get_buffer(f, (uint8_t *)id, len); 3594 id[len] = 0; 3595 3596 block = qemu_ram_block_by_name(id); 3597 if (!block) { 3598 error_report("Can't find block %s", id); 3599 return NULL; 3600 } 3601 3602 if (ramblock_is_ignored(block)) { 3603 error_report("block %s should not be migrated !", id); 3604 return NULL; 3605 } 3606 3607 mis->last_recv_block[channel] = block; 3608 3609 return block; 3610 } 3611 3612 static inline void *host_from_ram_block_offset(RAMBlock *block, 3613 ram_addr_t offset) 3614 { 3615 if (!offset_in_ramblock(block, offset)) { 3616 return NULL; 3617 } 3618 3619 return block->host + offset; 3620 } 3621 3622 static void *host_page_from_ram_block_offset(RAMBlock *block, 3623 ram_addr_t offset) 3624 { 3625 /* Note: Explicitly no check against offset_in_ramblock(). 
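 * (e.g. with block->page_size == 2MiB, offset 0x201000 resolves to the
 * host page starting at block->host + 0x200000.)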
*/ 3626 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3627 block->page_size); 3628 } 3629
3630 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3631 ram_addr_t offset) 3632 { 3633 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3634 } 3635
3636 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3637 ram_addr_t offset, bool record_bitmap) 3638 { 3639 if (!offset_in_ramblock(block, offset)) { 3640 return NULL; 3641 }
3642 if (!block->colo_cache) { 3643 error_report("%s: colo_cache is NULL in block :%s", 3644 __func__, block->idstr); 3645 return NULL; 3646 } 3647
3648 /* 3649 * During a colo checkpoint, we need a bitmap of these migrated pages. 3650 * It helps us decide which pages in the ram cache should be flushed 3651 * into the VM's RAM later. 3652 */ 3653 if (record_bitmap && 3654 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3655 ram_state->migration_dirty_pages++; 3656 } 3657 return block->colo_cache + offset; 3658 } 3659
3660 /** 3661 * ram_handle_compressed: handle the zero page case 3662 * 3663 * If a page (or a whole RDMA chunk) has been 3664 * determined to be zero, then zap it. 3665 * 3666 * @host: host address for the zero page 3667 * @ch: what the page is filled from. We only support zero 3668 * @size: size of the zero page 3669 */
3670 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3671 { 3672 if (ch != 0 || !buffer_is_zero(host, size)) { 3673 memset(host, ch, size); 3674 } 3675 } 3676
3677 /* return the size after decompression, or negative value on error */ 3678 static int 3679 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3680 const uint8_t *source, size_t source_len) 3681 { 3682 int err; 3683
3684 err = inflateReset(stream); 3685 if (err != Z_OK) { 3686 return -1; 3687 } 3688
3689 stream->avail_in = source_len; 3690 stream->next_in = (uint8_t *)source; 3691 stream->avail_out = dest_len; 3692 stream->next_out = dest; 3693
3694 err = inflate(stream, Z_NO_FLUSH); 3695 if (err != Z_STREAM_END) { 3696 return -1; 3697 } 3698 3699 return stream->total_out; 3700 } 3701
3702 static void *do_data_decompress(void *opaque) 3703 { 3704 DecompressParam *param = opaque; 3705 unsigned long pagesize; 3706 uint8_t *des; 3707 int len, ret; 3708
3709 qemu_mutex_lock(&param->mutex); 3710 while (!param->quit) { 3711 if (param->des) { 3712 des = param->des; 3713 len = param->len; 3714 param->des = 0; 3715 qemu_mutex_unlock(&param->mutex); 3716
3717 pagesize = TARGET_PAGE_SIZE; 3718
3719 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3720 param->compbuf, len); 3721 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3722 error_report("decompress data failed"); 3723 qemu_file_set_error(decomp_file, ret); 3724 } 3725
3726 qemu_mutex_lock(&decomp_done_lock); 3727 param->done = true; 3728 qemu_cond_signal(&decomp_done_cond); 3729 qemu_mutex_unlock(&decomp_done_lock); 3730
3731 qemu_mutex_lock(&param->mutex); 3732 } else { 3733 qemu_cond_wait(&param->cond, &param->mutex); 3734 } 3735 }
3736 qemu_mutex_unlock(&param->mutex); 3737 3738 return NULL; 3739 } 3740
3741 static int wait_for_decompress_done(void) 3742 { 3743 int idx, thread_count; 3744 3745 if (!migrate_compress()) { 3746 return 0; 3747 } 3748
3749 thread_count = migrate_decompress_threads(); 3750 qemu_mutex_lock(&decomp_done_lock); 3751 for (idx = 0; idx < thread_count; idx++) { 3752 while (!decomp_param[idx].done) { 3753 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3754 } 3755 } 3756 qemu_mutex_unlock(&decomp_done_lock); 3757 return qemu_file_get_error(decomp_file); 3758 } 3759
3760 static void compress_threads_load_cleanup(void) 3761 { 3762 int i, thread_count; 3763 3764 if (!migrate_compress()) { 3765 return; 3766 }
3767 thread_count = migrate_decompress_threads(); 3768 for (i = 0; i < thread_count; i++) { 3769 /* 3770 * we use it as an indicator which shows if the thread is 3771 * properly init'd or not 3772 */ 3773 if (!decomp_param[i].compbuf) { 3774 break; 3775 } 3776
3777 qemu_mutex_lock(&decomp_param[i].mutex); 3778 decomp_param[i].quit = true; 3779 qemu_cond_signal(&decomp_param[i].cond); 3780 qemu_mutex_unlock(&decomp_param[i].mutex); 3781 }
3782 for (i = 0; i < thread_count; i++) { 3783 if (!decomp_param[i].compbuf) { 3784 break; 3785 } 3786
3787 qemu_thread_join(decompress_threads + i); 3788 qemu_mutex_destroy(&decomp_param[i].mutex); 3789 qemu_cond_destroy(&decomp_param[i].cond); 3790 inflateEnd(&decomp_param[i].stream); 3791 g_free(decomp_param[i].compbuf); 3792 decomp_param[i].compbuf = NULL; 3793 }
3794 g_free(decompress_threads); 3795 g_free(decomp_param); 3796 decompress_threads = NULL; 3797 decomp_param = NULL; 3798 decomp_file = NULL; 3799 } 3800
3801 static int compress_threads_load_setup(QEMUFile *f) 3802 { 3803 int i, thread_count; 3804 3805 if (!migrate_compress()) { 3806 return 0; 3807 } 3808
3809 thread_count = migrate_decompress_threads(); 3810 decompress_threads = g_new0(QemuThread, thread_count); 3811 decomp_param = g_new0(DecompressParam, thread_count); 3812 qemu_mutex_init(&decomp_done_lock); 3813 qemu_cond_init(&decomp_done_cond); 3814 decomp_file = f;
3815 for (i = 0; i < thread_count; i++) { 3816 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3817 goto exit; 3818 } 3819
3820 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3821 qemu_mutex_init(&decomp_param[i].mutex); 3822 qemu_cond_init(&decomp_param[i].cond); 3823 decomp_param[i].done = true; 3824 decomp_param[i].quit = false; 3825 qemu_thread_create(decompress_threads + i, "decompress", 3826 do_data_decompress, decomp_param + i, 3827 QEMU_THREAD_JOINABLE); 3828 }
3829 return 0; 3830 exit: 3831 compress_threads_load_cleanup(); 3832 return -1; 3833 } 3834
3835 static void decompress_data_with_multi_threads(QEMUFile *f, 3836 void *host, int len) 3837 { 3838 int idx, thread_count; 3839
3840 thread_count = migrate_decompress_threads(); 3841 QEMU_LOCK_GUARD(&decomp_done_lock); 3842 while (true) { 3843 for (idx = 0; idx < thread_count; idx++) { 3844 if (decomp_param[idx].done) { 3845 decomp_param[idx].done = false; 3846 qemu_mutex_lock(&decomp_param[idx].mutex); 3847 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3848 decomp_param[idx].des = host; 3849 decomp_param[idx].len = len; 3850 qemu_cond_signal(&decomp_param[idx].cond); 3851 qemu_mutex_unlock(&decomp_param[idx].mutex); 3852 break; 3853 } 3854 }
3855 if (idx < thread_count) { 3856 break; 3857 } else { 3858 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3859 } 3860 } 3861 } 3862
3863 static void colo_init_ram_state(void) 3864 { 3865 ram_state_init(&ram_state); 3866 } 3867
3868 /* 3869 * colo cache: this is for the secondary VM, we cache the whole 3870 * memory of the secondary VM; the global lock needs to be held 3871 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of "
                             "block %s, size 0x" RAM_ADDR_FMT,
                             __func__, block->idstr, block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache,
                                           block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock must be held when calling this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If there is a qemu file error, we should stop here; "addr"
         * may be invalid in that case.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false
             * positives.  We might place pages beyond used_length in case
             * RAM was shrunk while in postcopy, which is fine - trying to
             * place via UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in one chunk.
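             *
             * For example (illustrative figures): with 4KiB target pages
             * and a 2MiB hugetlbfs host page, 512 consecutive target pages
             * accumulate in the temporary buffer before the host page is
             * placed in one go.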
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all target pages are zero, we can optimise the placement */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the first target page within the host page */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page.
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip setting page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use the temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer, to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd_flush_after_each_section()) {
                multifd_recv_sync_main();
            }
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page; wait for decompression before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is sent earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_compress()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage, we should not load pages into
             * the SVM's memory directly; we put them into colo_cache first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage, during which the VM had to be stopped - a
             * time-consuming process.  Here we optimize it by backing up
             * every page during the migration process whenever COLO is
             * enabled; although this slows the migration down, it clearly
             * reduces the downtime compared to backing up all of the SVM's
             * memory in the COLO preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In the migration stage but before the COLO stage,
                     * put all pages into both the cache and the SVM's memory.
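                     * (The copy into the cache happens at the end of the
                     * receive loop, via host_bak, once the page data has
                     * been fully loaded into 'host'.)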
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd_flush_after_each_section()) {
                multifd_recv_sync_main();
            }
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }
    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory; postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
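     * The sender rounds the bitmap size up to a multiple of 8 bytes and
     * appends a be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING), both of
     * which are checked below.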
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused); the
     * dirty bitmap won't change, so we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to get the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If
     * this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised.  Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}