/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"
#include "options.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
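
/*
 * Illustrative note (not part of the wire protocol definition itself):
 * these flags travel in the low bits of the 64-bit page offset written
 * by save_page_header() below, e.g.
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * Offsets are target-page aligned, so the low bits are otherwise zero.
 * RAM_SAVE_FLAG_CONTINUE additionally tells the destination that the
 * page belongs to the same RAMBlock as the previous one, so the block
 * idstr is not repeated.
 */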

int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
                                 uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        /* We must check that AVX is not just available, but usable. */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required so that it works when the source and destination VMs
     * are not using the same endianness.  (Note: big endian won't work.)
     */
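    /*
     * Illustrative layout of what ends up on the wire (numbers are an
     * example only, assuming 4 KiB target pages):
     *
     *   be64: size                   -- bitmap size in bytes, padded up to 8
     *   size bytes                   -- the little-endian receivedmap itself
     *   be64: 0x0123456789abcdef     -- RAMBLOCK_RECV_BITMAP_ENDING marker
     *
     * For a 1 GiB block, nbits = 262144 and size = 32768 bytes (already a
     * multiple of 8, so the ROUND_UP below would be a no-op in that case).
     */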
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines.  We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Number of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Number of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

RAMStats ram_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&ram_counters.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_counters.postcopy_bytes, bytes);
    } else {
        stat64_add(&ram_counters.downtime_bytes, bytes);
    }
    stat64_add(&ram_counters.transferred, bytes);
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
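/*
 * Sketch of the hand-off (as implemented by do_data_compress() and
 * compress_page_with_multi_thread() below): the migration thread picks a
 * CompressParam whose done flag is set, fills in block/offset under
 * param->mutex and signals param->cond; the worker compresses the page into
 * its private QEMUFile, then sets done = true under comp_done_lock and
 * signals comp_done_cond so the migration thread can collect the output.
 */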
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_compress() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator of whether the thread is
         * properly initialized or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_compress()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes.  If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration.  Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet.  Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold.
             */
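            /*
             * Illustrative example (numbers assumed, not from the source):
             * with throttle_now = 40 the guest currently gets cpu_now = 60.
             * If it dirtied twice the threshold (ratio 0.5), cpu_ideal = 30,
             * so the increase is MIN(60 - 30, pct_increment) rather than a
             * blind pct_increment jump; the "tailslow" knob trades speed of
             * convergence for a gentler tail.
             */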
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&ram_counters.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded.  This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included.  In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * that are outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If during sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long.  We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
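    /*
     * For illustration (values assumed, not mandated here): with 4 KiB
     * target pages and a clear_bmap_shift of 18, each clear-bitmap chunk
     * covers 1 << (12 + 18) bytes = 1 GiB, so a single clear_bmap bit
     * tracks a whole 1 GiB region and the dirty-log clear below is
     * batched at that granularity.
     */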
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk, because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size.  If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&ram_counters.normal_pages) +
        stat64_get(&ram_counters.zero_pages) +
        compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    stat64_set(&ram_counters.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_compress()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */
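        /*
         * Worked example (numbers assumed): with a 50% threshold and 1 GiB
         * transferred during the period, bytes_dirty_threshold is 512 MiB.
         * If the guest dirtied more than that in the same period, and that
         * happens twice, the guest gets throttled by the check below.
         */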
        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&ram_counters.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&ram_counters.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&ram_counters.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_counters.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_counters.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_counters.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_counters.normal_pages, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, f, block, offset)) {
        return true;
    }

    save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM,
     * so that we can catch errors during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_counters.zero_pages, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point.  In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet.  This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @rb: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock.  Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically.
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1977 if (ram_block_uffd_protect(block, uffd_fd)) { 1978 goto fail; 1979 } 1980 1981 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1982 block->host, block->max_length); 1983 } 1984 1985 return 0; 1986 1987 fail: 1988 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1989 1990 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1991 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1992 continue; 1993 } 1994 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1995 /* Cleanup flags and remove reference */ 1996 block->flags &= ~RAM_UF_WRITEPROTECT; 1997 memory_region_unref(block->mr); 1998 } 1999 2000 uffd_close_fd(uffd_fd); 2001 rs->uffdio_fd = -1; 2002 return -1; 2003 } 2004 2005 /** 2006 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 2007 */ 2008 void ram_write_tracking_stop(void) 2009 { 2010 RAMState *rs = ram_state; 2011 RAMBlock *block; 2012 2013 RCU_READ_LOCK_GUARD(); 2014 2015 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2016 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2017 continue; 2018 } 2019 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2020 2021 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2022 block->host, block->max_length); 2023 2024 /* Cleanup flags and remove reference */ 2025 block->flags &= ~RAM_UF_WRITEPROTECT; 2026 memory_region_unref(block->mr); 2027 } 2028 2029 /* Finally close UFFD file descriptor */ 2030 uffd_close_fd(rs->uffdio_fd); 2031 rs->uffdio_fd = -1; 2032 } 2033 2034 #else 2035 /* No target OS support, stubs just fail or ignore */ 2036 2037 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2038 { 2039 (void) rs; 2040 (void) offset; 2041 2042 return NULL; 2043 } 2044 2045 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2046 unsigned long start_page) 2047 { 2048 (void) rs; 2049 (void) pss; 2050 (void) start_page; 2051 2052 return 0; 2053 } 2054 2055 bool ram_write_tracking_available(void) 2056 { 2057 return false; 2058 } 2059 2060 bool ram_write_tracking_compatible(void) 2061 { 2062 assert(0); 2063 return false; 2064 } 2065 2066 int ram_write_tracking_start(void) 2067 { 2068 assert(0); 2069 return -1; 2070 } 2071 2072 void ram_write_tracking_stop(void) 2073 { 2074 assert(0); 2075 } 2076 #endif /* defined(__linux__) */ 2077 2078 /** 2079 * get_queued_page: unqueue a page from the postcopy requests 2080 * 2081 * Skips pages that are already sent (!dirty) 2082 * 2083 * Returns true if a queued page is found 2084 * 2085 * @rs: current RAM state 2086 * @pss: data about the state of the current dirty page scan 2087 */ 2088 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2089 { 2090 RAMBlock *block; 2091 ram_addr_t offset; 2092 bool dirty; 2093 2094 do { 2095 block = unqueue_page(rs, &offset); 2096 /* 2097 * We're sending this page, and since it's postcopy nothing else 2098 * will dirty it, and we must make sure it doesn't get sent again 2099 * even if this queue request was received after the background 2100 * search already sent it. 
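     * That is why the dirty bitmap is re-checked below and queued pages
     * that are already clean are skipped.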
2101 */ 2102 if (block) { 2103 unsigned long page; 2104 2105 page = offset >> TARGET_PAGE_BITS; 2106 dirty = test_bit(page, block->bmap); 2107 if (!dirty) { 2108 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2109 page); 2110 } else { 2111 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2112 } 2113 } 2114 2115 } while (block && !dirty); 2116 2117 if (!block) { 2118 /* 2119 * Poll write faults too if background snapshot is enabled; that's 2120 * when we have vcpus got blocked by the write protected pages. 2121 */ 2122 block = poll_fault_page(rs, &offset); 2123 } 2124 2125 if (block) { 2126 /* 2127 * We want the background search to continue from the queued page 2128 * since the guest is likely to want other pages near to the page 2129 * it just requested. 2130 */ 2131 pss->block = block; 2132 pss->page = offset >> TARGET_PAGE_BITS; 2133 2134 /* 2135 * This unqueued page would break the "one round" check, even is 2136 * really rare. 2137 */ 2138 pss->complete_round = false; 2139 } 2140 2141 return !!block; 2142 } 2143 2144 /** 2145 * migration_page_queue_free: drop any remaining pages in the ram 2146 * request queue 2147 * 2148 * It should be empty at the end anyway, but in error cases there may 2149 * be some left. in case that there is any page left, we drop it. 2150 * 2151 */ 2152 static void migration_page_queue_free(RAMState *rs) 2153 { 2154 struct RAMSrcPageRequest *mspr, *next_mspr; 2155 /* This queue generally should be empty - but in the case of a failed 2156 * migration might have some droppings in. 2157 */ 2158 RCU_READ_LOCK_GUARD(); 2159 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2160 memory_region_unref(mspr->rb->mr); 2161 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2162 g_free(mspr); 2163 } 2164 } 2165 2166 /** 2167 * ram_save_queue_pages: queue the page for transmission 2168 * 2169 * A request from postcopy destination for example. 2170 * 2171 * Returns zero on success or negative on error 2172 * 2173 * @rbname: Name of the RAMBLock of the request. NULL means the 2174 * same that last one. 2175 * @start: starting address from the start of the RAMBlock 2176 * @len: length (in bytes) to send 2177 */ 2178 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2179 { 2180 RAMBlock *ramblock; 2181 RAMState *rs = ram_state; 2182 2183 stat64_add(&ram_counters.postcopy_requests, 1); 2184 RCU_READ_LOCK_GUARD(); 2185 2186 if (!rbname) { 2187 /* Reuse last RAMBlock */ 2188 ramblock = rs->last_req_rb; 2189 2190 if (!ramblock) { 2191 /* 2192 * Shouldn't happen, we can't reuse the last RAMBlock if 2193 * it's the 1st request. 2194 */ 2195 error_report("ram_save_queue_pages no previous block"); 2196 return -1; 2197 } 2198 } else { 2199 ramblock = qemu_ram_block_by_name(rbname); 2200 2201 if (!ramblock) { 2202 /* We shouldn't be asked for a non-existent RAMBlock */ 2203 error_report("ram_save_queue_pages no block '%s'", rbname); 2204 return -1; 2205 } 2206 rs->last_req_rb = ramblock; 2207 } 2208 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2209 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2210 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2211 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2212 __func__, start, len, ramblock->used_length); 2213 return -1; 2214 } 2215 2216 /* 2217 * When with postcopy preempt, we send back the page directly in the 2218 * rp-return thread. 
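 * The page goes out on the dedicated preempt channel
 * (postcopy_qemufile_src) rather than being queued for the migration thread.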
2219 */ 2220 if (postcopy_preempt_active()) { 2221 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2222 size_t page_size = qemu_ram_pagesize(ramblock); 2223 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2224 int ret = 0; 2225 2226 qemu_mutex_lock(&rs->bitmap_mutex); 2227 2228 pss_init(pss, ramblock, page_start); 2229 /* 2230 * Always use the preempt channel, and make sure it's there. It's 2231 * safe to access without lock, because when rp-thread is running 2232 * we should be the only one who operates on the qemufile 2233 */ 2234 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2235 assert(pss->pss_channel); 2236 2237 /* 2238 * It must be either one or multiple of host page size. Just 2239 * assert; if something wrong we're mostly split brain anyway. 2240 */ 2241 assert(len % page_size == 0); 2242 while (len) { 2243 if (ram_save_host_page_urgent(pss)) { 2244 error_report("%s: ram_save_host_page_urgent() failed: " 2245 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2246 __func__, ramblock->idstr, start); 2247 ret = -1; 2248 break; 2249 } 2250 /* 2251 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2252 * will automatically be moved and point to the next host page 2253 * we're going to send, so no need to update here. 2254 * 2255 * Normally QEMU never sends >1 host page in requests, so 2256 * logically we don't even need that as the loop should only 2257 * run once, but just to be consistent. 2258 */ 2259 len -= page_size; 2260 }; 2261 qemu_mutex_unlock(&rs->bitmap_mutex); 2262 2263 return ret; 2264 } 2265 2266 struct RAMSrcPageRequest *new_entry = 2267 g_new0(struct RAMSrcPageRequest, 1); 2268 new_entry->rb = ramblock; 2269 new_entry->offset = start; 2270 new_entry->len = len; 2271 2272 memory_region_ref(ramblock->mr); 2273 qemu_mutex_lock(&rs->src_page_req_mutex); 2274 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2275 migration_make_urgent_request(); 2276 qemu_mutex_unlock(&rs->src_page_req_mutex); 2277 2278 return 0; 2279 } 2280 2281 static bool save_page_use_compression(RAMState *rs) 2282 { 2283 if (!migrate_compress()) { 2284 return false; 2285 } 2286 2287 /* 2288 * If xbzrle is enabled (e.g., after first round of migration), stop 2289 * using the data compression. In theory, xbzrle can do better than 2290 * compression. 2291 */ 2292 if (rs->xbzrle_enabled) { 2293 return false; 2294 } 2295 2296 return true; 2297 } 2298 2299 /* 2300 * try to compress the page before posting it out, return true if the page 2301 * has been properly handled by compression, otherwise needs other 2302 * paths to handle it 2303 */ 2304 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2305 RAMBlock *block, ram_addr_t offset) 2306 { 2307 if (!save_page_use_compression(rs)) { 2308 return false; 2309 } 2310 2311 /* 2312 * When starting the process of a new block, the first page of 2313 * the block should be sent out before other pages in the same 2314 * block, and all the pages in last block should have been sent 2315 * out, keeping this order is important, because the 'cont' flag 2316 * is used to avoid resending the block name. 2317 * 2318 * We post the fist page as normal page as compression will take 2319 * much CPU resource. 
2320 */ 2321 if (block != pss->last_sent_block) { 2322 flush_compressed_data(rs); 2323 return false; 2324 } 2325 2326 if (compress_page_with_multi_thread(block, offset) > 0) { 2327 return true; 2328 } 2329 2330 compression_counters.busy++; 2331 return false; 2332 } 2333 2334 /** 2335 * ram_save_target_page_legacy: save one target page 2336 * 2337 * Returns the number of pages written 2338 * 2339 * @rs: current RAM state 2340 * @pss: data about the page we want to send 2341 */ 2342 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2343 { 2344 RAMBlock *block = pss->block; 2345 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2346 int res; 2347 2348 if (control_save_page(pss, block, offset, &res)) { 2349 return res; 2350 } 2351 2352 if (save_compress_page(rs, pss, block, offset)) { 2353 return 1; 2354 } 2355 2356 res = save_zero_page(pss, pss->pss_channel, block, offset); 2357 if (res > 0) { 2358 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2359 * page would be stale 2360 */ 2361 if (rs->xbzrle_enabled) { 2362 XBZRLE_cache_lock(); 2363 xbzrle_cache_zero_page(rs, block->offset + offset); 2364 XBZRLE_cache_unlock(); 2365 } 2366 return res; 2367 } 2368 2369 /* 2370 * Do not use multifd in postcopy as one whole host page should be 2371 * placed. Meanwhile postcopy requires atomic update of pages, so even 2372 * if host page size == guest page size the dest guest during run may 2373 * still see partially copied pages which is data corruption. 2374 */ 2375 if (migrate_multifd() && !migration_in_postcopy()) { 2376 return ram_save_multifd_page(pss->pss_channel, block, offset); 2377 } 2378 2379 return ram_save_page(rs, pss); 2380 } 2381 2382 /* Should be called before sending a host page */ 2383 static void pss_host_page_prepare(PageSearchStatus *pss) 2384 { 2385 /* How many guest pages are there in one host page? */ 2386 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2387 2388 pss->host_page_sending = true; 2389 if (guest_pfns <= 1) { 2390 /* 2391 * This covers both when guest psize == host psize, or when guest 2392 * has larger psize than the host (guest_pfns==0). 2393 * 2394 * For the latter, we always send one whole guest page per 2395 * iteration of the host page (example: an Alpha VM on x86 host 2396 * will have guest psize 8K while host psize 4K). 2397 */ 2398 pss->host_page_start = pss->page; 2399 pss->host_page_end = pss->page + 1; 2400 } else { 2401 /* 2402 * The host page spans over multiple guest pages, we send them 2403 * within the same host page iteration. 2404 */ 2405 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2406 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2407 } 2408 } 2409 2410 /* 2411 * Whether the page pointed by PSS is within the host page being sent. 2412 * Must be called after a previous pss_host_page_prepare(). 2413 */ 2414 static bool pss_within_range(PageSearchStatus *pss) 2415 { 2416 ram_addr_t ram_addr; 2417 2418 assert(pss->host_page_sending); 2419 2420 /* Over host-page boundary? 
 */
    if (pss->page >= pss->host_page_end) {
        return false;
    }

    ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;

    return offset_in_ramblock(pss->block, ram_addr);
}

static void pss_host_page_finish(PageSearchStatus *pss)
{
    pss->host_page_sending = false;
    /* This is not needed, but just to reset it */
    pss->host_page_start = pss->host_page_end = 0;
}

/*
 * Send an urgent host page specified by `pss'. Need to be called with
 * bitmap_mutex held.
 *
 * Returns 0 if saving the host page succeeded, negative value otherwise.
 */
static int ram_save_host_page_urgent(PageSearchStatus *pss)
{
    bool page_dirty, sent = false;
    RAMState *rs = ram_state;
    int ret = 0;

    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
    pss_host_page_prepare(pss);

    /*
     * If precopy is sending the same page, let it be done in precopy, or
     * we could send the same page in two channels and neither of them will
     * receive the whole page.
     */
    if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
        trace_postcopy_preempt_hit(pss->block->idstr,
                                   pss->page << TARGET_PAGE_BITS);
        return 0;
    }

    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        if (page_dirty) {
            /* Be strict about the return code; it must be 1 (one page sent) */
            if (migration_ops->ram_save_target_page(rs, pss) != 1) {
                error_report_once("%s: ram_save_target_page failed", __func__);
                ret = -1;
                goto out;
            }
            sent = true;
        }
        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));
out:
    pss_host_page_finish(pss);
    /* For urgent requests, flush immediately if sent */
    if (sent) {
        qemu_fflush(pss->pss_channel);
    }
    return ret;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * The caller must hold ram_state.bitmap_mutex when calling this
 * function. Note that this function can temporarily release the lock, but
 * it makes sure the lock is held again by the time it returns.
2501 * 2502 * Returns the number of pages written or negative on error 2503 * 2504 * @rs: current RAM state 2505 * @pss: data about the page we want to send 2506 */ 2507 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2508 { 2509 bool page_dirty, preempt_active = postcopy_preempt_active(); 2510 int tmppages, pages = 0; 2511 size_t pagesize_bits = 2512 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2513 unsigned long start_page = pss->page; 2514 int res; 2515 2516 if (ramblock_is_ignored(pss->block)) { 2517 error_report("block %s should not be migrated !", pss->block->idstr); 2518 return 0; 2519 } 2520 2521 /* Update host page boundary information */ 2522 pss_host_page_prepare(pss); 2523 2524 do { 2525 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2526 2527 /* Check the pages is dirty and if it is send it */ 2528 if (page_dirty) { 2529 /* 2530 * Properly yield the lock only in postcopy preempt mode 2531 * because both migration thread and rp-return thread can 2532 * operate on the bitmaps. 2533 */ 2534 if (preempt_active) { 2535 qemu_mutex_unlock(&rs->bitmap_mutex); 2536 } 2537 tmppages = migration_ops->ram_save_target_page(rs, pss); 2538 if (tmppages >= 0) { 2539 pages += tmppages; 2540 /* 2541 * Allow rate limiting to happen in the middle of huge pages if 2542 * something is sent in the current iteration. 2543 */ 2544 if (pagesize_bits > 1 && tmppages > 0) { 2545 migration_rate_limit(); 2546 } 2547 } 2548 if (preempt_active) { 2549 qemu_mutex_lock(&rs->bitmap_mutex); 2550 } 2551 } else { 2552 tmppages = 0; 2553 } 2554 2555 if (tmppages < 0) { 2556 pss_host_page_finish(pss); 2557 return tmppages; 2558 } 2559 2560 pss_find_next_dirty(pss); 2561 } while (pss_within_range(pss)); 2562 2563 pss_host_page_finish(pss); 2564 2565 res = ram_save_release_protection(rs, pss, start_page); 2566 return (res < 0 ? res : pages); 2567 } 2568 2569 /** 2570 * ram_find_and_save_block: finds a dirty page and sends it to f 2571 * 2572 * Called within an RCU critical section. 2573 * 2574 * Returns the number of pages written where zero means no dirty pages, 2575 * or negative on error 2576 * 2577 * @rs: current RAM state 2578 * 2579 * On systems where host-page-size > target-page-size it will send all the 2580 * pages in a host page that are dirty. 2581 */ 2582 static int ram_find_and_save_block(RAMState *rs) 2583 { 2584 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2585 int pages = 0; 2586 2587 /* No dirty page as there is zero RAM */ 2588 if (!rs->ram_bytes_total) { 2589 return pages; 2590 } 2591 2592 /* 2593 * Always keep last_seen_block/last_page valid during this procedure, 2594 * because find_dirty_block() relies on these values (e.g., we compare 2595 * last_seen_block with pss.block to see whether we searched all the 2596 * ramblocks) to detect the completion of migration. Having NULL value 2597 * of last_seen_block can conditionally cause below loop to run forever. 
2598 */ 2599 if (!rs->last_seen_block) { 2600 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2601 rs->last_page = 0; 2602 } 2603 2604 pss_init(pss, rs->last_seen_block, rs->last_page); 2605 2606 while (true){ 2607 if (!get_queued_page(rs, pss)) { 2608 /* priority queue empty, so just search for something dirty */ 2609 int res = find_dirty_block(rs, pss); 2610 if (res != PAGE_DIRTY_FOUND) { 2611 if (res == PAGE_ALL_CLEAN) { 2612 break; 2613 } else if (res == PAGE_TRY_AGAIN) { 2614 continue; 2615 } else if (res < 0) { 2616 pages = res; 2617 break; 2618 } 2619 } 2620 } 2621 pages = ram_save_host_page(rs, pss); 2622 if (pages) { 2623 break; 2624 } 2625 } 2626 2627 rs->last_seen_block = pss->block; 2628 rs->last_page = pss->page; 2629 2630 return pages; 2631 } 2632 2633 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2634 { 2635 uint64_t pages = size / TARGET_PAGE_SIZE; 2636 2637 if (zero) { 2638 stat64_add(&ram_counters.zero_pages, pages); 2639 } else { 2640 stat64_add(&ram_counters.normal_pages, pages); 2641 ram_transferred_add(size); 2642 qemu_file_credit_transfer(f, size); 2643 } 2644 } 2645 2646 static uint64_t ram_bytes_total_with_ignored(void) 2647 { 2648 RAMBlock *block; 2649 uint64_t total = 0; 2650 2651 RCU_READ_LOCK_GUARD(); 2652 2653 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2654 total += block->used_length; 2655 } 2656 return total; 2657 } 2658 2659 uint64_t ram_bytes_total(void) 2660 { 2661 RAMBlock *block; 2662 uint64_t total = 0; 2663 2664 RCU_READ_LOCK_GUARD(); 2665 2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2667 total += block->used_length; 2668 } 2669 return total; 2670 } 2671 2672 static void xbzrle_load_setup(void) 2673 { 2674 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2675 } 2676 2677 static void xbzrle_load_cleanup(void) 2678 { 2679 g_free(XBZRLE.decoded_buf); 2680 XBZRLE.decoded_buf = NULL; 2681 } 2682 2683 static void ram_state_cleanup(RAMState **rsp) 2684 { 2685 if (*rsp) { 2686 migration_page_queue_free(*rsp); 2687 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2688 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2689 g_free(*rsp); 2690 *rsp = NULL; 2691 } 2692 } 2693 2694 static void xbzrle_cleanup(void) 2695 { 2696 XBZRLE_cache_lock(); 2697 if (XBZRLE.cache) { 2698 cache_fini(XBZRLE.cache); 2699 g_free(XBZRLE.encoded_buf); 2700 g_free(XBZRLE.current_buf); 2701 g_free(XBZRLE.zero_target_page); 2702 XBZRLE.cache = NULL; 2703 XBZRLE.encoded_buf = NULL; 2704 XBZRLE.current_buf = NULL; 2705 XBZRLE.zero_target_page = NULL; 2706 } 2707 XBZRLE_cache_unlock(); 2708 } 2709 2710 static void ram_save_cleanup(void *opaque) 2711 { 2712 RAMState **rsp = opaque; 2713 RAMBlock *block; 2714 2715 /* We don't use dirty log with background snapshots */ 2716 if (!migrate_background_snapshot()) { 2717 /* caller have hold iothread lock or is in a bh, so there is 2718 * no writing race against the migration bitmap 2719 */ 2720 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2721 /* 2722 * do not stop dirty log without starting it, since 2723 * memory_global_dirty_log_stop will assert that 2724 * memory_global_dirty_log_start/stop used in pairs 2725 */ 2726 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2727 } 2728 } 2729 2730 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2731 g_free(block->clear_bmap); 2732 block->clear_bmap = NULL; 2733 g_free(block->bmap); 2734 block->bmap = NULL; 2735 } 2736 2737 xbzrle_cleanup(); 2738 compress_threads_save_cleanup(); 2739 ram_state_cleanup(rsp); 2740 g_free(migration_ops); 2741 migration_ops = NULL; 2742 } 2743 2744 static 
void ram_state_reset(RAMState *rs) 2745 { 2746 int i; 2747 2748 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2749 rs->pss[i].last_sent_block = NULL; 2750 } 2751 2752 rs->last_seen_block = NULL; 2753 rs->last_page = 0; 2754 rs->last_version = ram_list.version; 2755 rs->xbzrle_enabled = false; 2756 } 2757 2758 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2759 2760 /* **** functions for postcopy ***** */ 2761 2762 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2763 { 2764 struct RAMBlock *block; 2765 2766 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2767 unsigned long *bitmap = block->bmap; 2768 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2769 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2770 2771 while (run_start < range) { 2772 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2773 ram_discard_range(block->idstr, 2774 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2775 ((ram_addr_t)(run_end - run_start)) 2776 << TARGET_PAGE_BITS); 2777 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2778 } 2779 } 2780 } 2781 2782 /** 2783 * postcopy_send_discard_bm_ram: discard a RAMBlock 2784 * 2785 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2786 * 2787 * @ms: current migration state 2788 * @block: RAMBlock to discard 2789 */ 2790 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2791 { 2792 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2793 unsigned long current; 2794 unsigned long *bitmap = block->bmap; 2795 2796 for (current = 0; current < end; ) { 2797 unsigned long one = find_next_bit(bitmap, end, current); 2798 unsigned long zero, discard_length; 2799 2800 if (one >= end) { 2801 break; 2802 } 2803 2804 zero = find_next_zero_bit(bitmap, end, one + 1); 2805 2806 if (zero >= end) { 2807 discard_length = end - one; 2808 } else { 2809 discard_length = zero - one; 2810 } 2811 postcopy_discard_send_range(ms, one, discard_length); 2812 current = one + discard_length; 2813 } 2814 } 2815 2816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2817 2818 /** 2819 * postcopy_each_ram_send_discard: discard all RAMBlocks 2820 * 2821 * Utility for the outgoing postcopy code. 2822 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2823 * passing it bitmap indexes and name. 2824 * (qemu_ram_foreach_block ends up passing unscaled lengths 2825 * which would mean postcopy code would have to deal with target page) 2826 * 2827 * @ms: current migration state 2828 */ 2829 static void postcopy_each_ram_send_discard(MigrationState *ms) 2830 { 2831 struct RAMBlock *block; 2832 2833 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2834 postcopy_discard_send_init(ms, block->idstr); 2835 2836 /* 2837 * Deal with TPS != HPS and huge pages. It discard any partially sent 2838 * host-page size chunks, mark any partially dirty host-page size 2839 * chunks as all dirty. In this case the host-page is the host-page 2840 * for the particular RAMBlock, i.e. it might be a huge page. 2841 */ 2842 postcopy_chunk_hostpages_pass(ms, block); 2843 2844 /* 2845 * Postcopy sends chunks of bitmap over the wire, but it 2846 * just needs indexes at this point, avoids it having 2847 * target page specific code. 
2848 */ 2849 postcopy_send_discard_bm_ram(ms, block); 2850 postcopy_discard_send_finish(ms); 2851 } 2852 } 2853 2854 /** 2855 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2856 * 2857 * Helper for postcopy_chunk_hostpages; it's called twice to 2858 * canonicalize the two bitmaps, that are similar, but one is 2859 * inverted. 2860 * 2861 * Postcopy requires that all target pages in a hostpage are dirty or 2862 * clean, not a mix. This function canonicalizes the bitmaps. 2863 * 2864 * @ms: current migration state 2865 * @block: block that contains the page we want to canonicalize 2866 */ 2867 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2868 { 2869 RAMState *rs = ram_state; 2870 unsigned long *bitmap = block->bmap; 2871 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2872 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2873 unsigned long run_start; 2874 2875 if (block->page_size == TARGET_PAGE_SIZE) { 2876 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2877 return; 2878 } 2879 2880 /* Find a dirty page */ 2881 run_start = find_next_bit(bitmap, pages, 0); 2882 2883 while (run_start < pages) { 2884 2885 /* 2886 * If the start of this run of pages is in the middle of a host 2887 * page, then we need to fixup this host page. 2888 */ 2889 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2890 /* Find the end of this run */ 2891 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2892 /* 2893 * If the end isn't at the start of a host page, then the 2894 * run doesn't finish at the end of a host page 2895 * and we need to discard. 2896 */ 2897 } 2898 2899 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2900 unsigned long page; 2901 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2902 host_ratio); 2903 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2904 2905 /* Clean up the bitmap */ 2906 for (page = fixup_start_addr; 2907 page < fixup_start_addr + host_ratio; page++) { 2908 /* 2909 * Remark them as dirty, updating the count for any pages 2910 * that weren't previously dirty. 
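 * For example, with 2 MiB hugepages and 4 KiB target pages
 * (host_ratio == 512), a run boundary that falls inside a hugepage makes
 * all 512 target pages of that hugepage dirty again.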
2911 */ 2912 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2913 } 2914 } 2915 2916 /* Find the next dirty page for the next iteration */ 2917 run_start = find_next_bit(bitmap, pages, run_start); 2918 } 2919 } 2920 2921 /** 2922 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2923 * 2924 * Transmit the set of pages to be discarded after precopy to the target 2925 * these are pages that: 2926 * a) Have been previously transmitted but are now dirty again 2927 * b) Pages that have never been transmitted, this ensures that 2928 * any pages on the destination that have been mapped by background 2929 * tasks get discarded (transparent huge pages is the specific concern) 2930 * Hopefully this is pretty sparse 2931 * 2932 * @ms: current migration state 2933 */ 2934 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2935 { 2936 RAMState *rs = ram_state; 2937 2938 RCU_READ_LOCK_GUARD(); 2939 2940 /* This should be our last sync, the src is now paused */ 2941 migration_bitmap_sync(rs); 2942 2943 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2944 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2945 rs->last_seen_block = NULL; 2946 rs->last_page = 0; 2947 2948 postcopy_each_ram_send_discard(ms); 2949 2950 trace_ram_postcopy_send_discard_bitmap(); 2951 } 2952 2953 /** 2954 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2955 * 2956 * Returns zero on success 2957 * 2958 * @rbname: name of the RAMBlock of the request. NULL means the 2959 * same that last one. 2960 * @start: RAMBlock starting page 2961 * @length: RAMBlock size 2962 */ 2963 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2964 { 2965 trace_ram_discard_range(rbname, start, length); 2966 2967 RCU_READ_LOCK_GUARD(); 2968 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2969 2970 if (!rb) { 2971 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2972 return -1; 2973 } 2974 2975 /* 2976 * On source VM, we don't need to update the received bitmap since 2977 * we don't even have one. 2978 */ 2979 if (rb->receivedmap) { 2980 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2981 length >> qemu_target_page_bits()); 2982 } 2983 2984 return ram_block_discard_range(rb, start, length); 2985 } 2986 2987 /* 2988 * For every allocation, we will try not to crash the VM if the 2989 * allocation failed. 
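 * (hence the g_try_*() allocations and the unwind labels below)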
2990 */ 2991 static int xbzrle_init(void) 2992 { 2993 Error *local_err = NULL; 2994 2995 if (!migrate_xbzrle()) { 2996 return 0; 2997 } 2998 2999 XBZRLE_cache_lock(); 3000 3001 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 3002 if (!XBZRLE.zero_target_page) { 3003 error_report("%s: Error allocating zero page", __func__); 3004 goto err_out; 3005 } 3006 3007 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 3008 TARGET_PAGE_SIZE, &local_err); 3009 if (!XBZRLE.cache) { 3010 error_report_err(local_err); 3011 goto free_zero_page; 3012 } 3013 3014 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3015 if (!XBZRLE.encoded_buf) { 3016 error_report("%s: Error allocating encoded_buf", __func__); 3017 goto free_cache; 3018 } 3019 3020 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3021 if (!XBZRLE.current_buf) { 3022 error_report("%s: Error allocating current_buf", __func__); 3023 goto free_encoded_buf; 3024 } 3025 3026 /* We are all good */ 3027 XBZRLE_cache_unlock(); 3028 return 0; 3029 3030 free_encoded_buf: 3031 g_free(XBZRLE.encoded_buf); 3032 XBZRLE.encoded_buf = NULL; 3033 free_cache: 3034 cache_fini(XBZRLE.cache); 3035 XBZRLE.cache = NULL; 3036 free_zero_page: 3037 g_free(XBZRLE.zero_target_page); 3038 XBZRLE.zero_target_page = NULL; 3039 err_out: 3040 XBZRLE_cache_unlock(); 3041 return -ENOMEM; 3042 } 3043 3044 static int ram_state_init(RAMState **rsp) 3045 { 3046 *rsp = g_try_new0(RAMState, 1); 3047 3048 if (!*rsp) { 3049 error_report("%s: Init ramstate fail", __func__); 3050 return -1; 3051 } 3052 3053 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3054 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3055 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3056 (*rsp)->ram_bytes_total = ram_bytes_total(); 3057 3058 /* 3059 * Count the total number of pages used by ram blocks not including any 3060 * gaps due to alignment or unplugs. 3061 * This must match with the initial values of dirty bitmap. 3062 */ 3063 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3064 ram_state_reset(*rsp); 3065 3066 return 0; 3067 } 3068 3069 static void ram_list_init_bitmaps(void) 3070 { 3071 MigrationState *ms = migrate_get_current(); 3072 RAMBlock *block; 3073 unsigned long pages; 3074 uint8_t shift; 3075 3076 /* Skip setting bitmap if there is no RAM */ 3077 if (ram_bytes_total()) { 3078 shift = ms->clear_bitmap_shift; 3079 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3080 error_report("clear_bitmap_shift (%u) too big, using " 3081 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3082 shift = CLEAR_BITMAP_SHIFT_MAX; 3083 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3084 error_report("clear_bitmap_shift (%u) too small, using " 3085 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3086 shift = CLEAR_BITMAP_SHIFT_MIN; 3087 } 3088 3089 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3090 pages = block->max_length >> TARGET_PAGE_BITS; 3091 /* 3092 * The initial dirty bitmap for migration must be set with all 3093 * ones to make sure we'll migrate every guest RAM page to 3094 * destination. 3095 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3096 * new migration after a failed migration, ram_list. 3097 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3098 * guest memory. 
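 * (bitmap_new() returns an all-zero bitmap, so the bitmap_set() below is
 * what actually marks every page as dirty)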
3099 */ 3100 block->bmap = bitmap_new(pages); 3101 bitmap_set(block->bmap, 0, pages); 3102 block->clear_bmap_shift = shift; 3103 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3104 } 3105 } 3106 } 3107 3108 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3109 { 3110 unsigned long pages; 3111 RAMBlock *rb; 3112 3113 RCU_READ_LOCK_GUARD(); 3114 3115 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3116 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3117 rs->migration_dirty_pages -= pages; 3118 } 3119 } 3120 3121 static void ram_init_bitmaps(RAMState *rs) 3122 { 3123 /* For memory_global_dirty_log_start below. */ 3124 qemu_mutex_lock_iothread(); 3125 qemu_mutex_lock_ramlist(); 3126 3127 WITH_RCU_READ_LOCK_GUARD() { 3128 ram_list_init_bitmaps(); 3129 /* We don't use dirty log with background snapshots */ 3130 if (!migrate_background_snapshot()) { 3131 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3132 migration_bitmap_sync_precopy(rs); 3133 } 3134 } 3135 qemu_mutex_unlock_ramlist(); 3136 qemu_mutex_unlock_iothread(); 3137 3138 /* 3139 * After an eventual first bitmap sync, fixup the initial bitmap 3140 * containing all 1s to exclude any discarded pages from migration. 3141 */ 3142 migration_bitmap_clear_discarded_pages(rs); 3143 } 3144 3145 static int ram_init_all(RAMState **rsp) 3146 { 3147 if (ram_state_init(rsp)) { 3148 return -1; 3149 } 3150 3151 if (xbzrle_init()) { 3152 ram_state_cleanup(rsp); 3153 return -1; 3154 } 3155 3156 ram_init_bitmaps(*rsp); 3157 3158 return 0; 3159 } 3160 3161 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3162 { 3163 RAMBlock *block; 3164 uint64_t pages = 0; 3165 3166 /* 3167 * Postcopy is not using xbzrle/compression, so no need for that. 3168 * Also, since source are already halted, we don't need to care 3169 * about dirty page logging as well. 3170 */ 3171 3172 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3173 pages += bitmap_count_one(block->bmap, 3174 block->used_length >> TARGET_PAGE_BITS); 3175 } 3176 3177 /* This may not be aligned with current bitmaps. Recalculate. */ 3178 rs->migration_dirty_pages = pages; 3179 3180 ram_state_reset(rs); 3181 3182 /* Update RAMState cache of output QEMUFile */ 3183 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3184 3185 trace_ram_state_resume_prepare(pages); 3186 } 3187 3188 /* 3189 * This function clears bits of the free pages reported by the caller from the 3190 * migration dirty bitmap. @addr is the host address corresponding to the 3191 * start of the continuous guest free pages, and @len is the total bytes of 3192 * those pages. 3193 */ 3194 void qemu_guest_free_page_hint(void *addr, size_t len) 3195 { 3196 RAMBlock *block; 3197 ram_addr_t offset; 3198 size_t used_len, start, npages; 3199 MigrationState *s = migrate_get_current(); 3200 3201 /* This function is currently expected to be used during live migration */ 3202 if (!migration_is_setup_or_active(s->state)) { 3203 return; 3204 } 3205 3206 for (; len > 0; len -= used_len, addr += used_len) { 3207 block = qemu_ram_block_from_host(addr, false, &offset); 3208 if (unlikely(!block || offset >= block->used_length)) { 3209 /* 3210 * The implementation might not support RAMBlock resize during 3211 * live migration, but it could happen in theory with future 3212 * updates. So we add a check here to capture that case. 
3213 */ 3214 error_report_once("%s unexpected error", __func__); 3215 return; 3216 } 3217 3218 if (len <= block->used_length - offset) { 3219 used_len = len; 3220 } else { 3221 used_len = block->used_length - offset; 3222 } 3223 3224 start = offset >> TARGET_PAGE_BITS; 3225 npages = used_len >> TARGET_PAGE_BITS; 3226 3227 qemu_mutex_lock(&ram_state->bitmap_mutex); 3228 /* 3229 * The skipped free pages are equavalent to be sent from clear_bmap's 3230 * perspective, so clear the bits from the memory region bitmap which 3231 * are initially set. Otherwise those skipped pages will be sent in 3232 * the next round after syncing from the memory region bitmap. 3233 */ 3234 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3235 ram_state->migration_dirty_pages -= 3236 bitmap_count_one_with_offset(block->bmap, start, npages); 3237 bitmap_clear(block->bmap, start, npages); 3238 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3239 } 3240 } 3241 3242 /* 3243 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3244 * long-running RCU critical section. When rcu-reclaims in the code 3245 * start to become numerous it will be necessary to reduce the 3246 * granularity of these critical sections. 3247 */ 3248 3249 /** 3250 * ram_save_setup: Setup RAM for migration 3251 * 3252 * Returns zero to indicate success and negative for error 3253 * 3254 * @f: QEMUFile where to send the data 3255 * @opaque: RAMState pointer 3256 */ 3257 static int ram_save_setup(QEMUFile *f, void *opaque) 3258 { 3259 RAMState **rsp = opaque; 3260 RAMBlock *block; 3261 int ret; 3262 3263 if (compress_threads_save_setup()) { 3264 return -1; 3265 } 3266 3267 /* migration has already setup the bitmap, reuse it. */ 3268 if (!migration_in_colo_state()) { 3269 if (ram_init_all(rsp) != 0) { 3270 compress_threads_save_cleanup(); 3271 return -1; 3272 } 3273 } 3274 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3275 3276 WITH_RCU_READ_LOCK_GUARD() { 3277 qemu_put_be64(f, ram_bytes_total_with_ignored() 3278 | RAM_SAVE_FLAG_MEM_SIZE); 3279 3280 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3281 qemu_put_byte(f, strlen(block->idstr)); 3282 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3283 qemu_put_be64(f, block->used_length); 3284 if (migrate_postcopy_ram() && block->page_size != 3285 qemu_host_page_size) { 3286 qemu_put_be64(f, block->page_size); 3287 } 3288 if (migrate_ignore_shared()) { 3289 qemu_put_be64(f, block->mr->addr); 3290 } 3291 } 3292 } 3293 3294 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3295 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3296 3297 migration_ops = g_malloc0(sizeof(MigrationOps)); 3298 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3299 ret = multifd_send_sync_main(f); 3300 if (ret < 0) { 3301 return ret; 3302 } 3303 3304 if (!migrate_multifd_flush_after_each_section()) { 3305 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3306 } 3307 3308 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3309 qemu_fflush(f); 3310 3311 return 0; 3312 } 3313 3314 /** 3315 * ram_save_iterate: iterative stage for migration 3316 * 3317 * Returns zero to indicate success and negative for error 3318 * 3319 * @f: QEMUFile where to send the data 3320 * @opaque: RAMState pointer 3321 */ 3322 static int ram_save_iterate(QEMUFile *f, void *opaque) 3323 { 3324 RAMState **temp = opaque; 3325 RAMState *rs = *temp; 3326 int ret = 0; 3327 int i; 3328 int64_t t0; 3329 int done = 0; 3330 3331 if (blk_mig_bulk_active()) { 3332 /* Avoid transferring ram during bulk phase of block 
migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll hold this lock for a while, but it's okay for two reasons.
     * Firstly, the only other thread that may take it is the one calling
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check once
             * every few iterations.
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                    1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
3410 */ 3411 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3412 3413 out: 3414 if (ret >= 0 3415 && migration_is_setup_or_active(migrate_get_current()->state)) { 3416 if (migrate_multifd_flush_after_each_section()) { 3417 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3418 if (ret < 0) { 3419 return ret; 3420 } 3421 } 3422 3423 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3424 qemu_fflush(f); 3425 ram_transferred_add(8); 3426 3427 ret = qemu_file_get_error(f); 3428 } 3429 if (ret < 0) { 3430 return ret; 3431 } 3432 3433 return done; 3434 } 3435 3436 /** 3437 * ram_save_complete: function called to send the remaining amount of ram 3438 * 3439 * Returns zero to indicate success or negative on error 3440 * 3441 * Called with iothread lock 3442 * 3443 * @f: QEMUFile where to send the data 3444 * @opaque: RAMState pointer 3445 */ 3446 static int ram_save_complete(QEMUFile *f, void *opaque) 3447 { 3448 RAMState **temp = opaque; 3449 RAMState *rs = *temp; 3450 int ret = 0; 3451 3452 rs->last_stage = !migration_in_colo_state(); 3453 3454 WITH_RCU_READ_LOCK_GUARD() { 3455 if (!migration_in_postcopy()) { 3456 migration_bitmap_sync_precopy(rs); 3457 } 3458 3459 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3460 3461 /* try transferring iterative blocks of memory */ 3462 3463 /* flush all remaining blocks regardless of rate limiting */ 3464 qemu_mutex_lock(&rs->bitmap_mutex); 3465 while (true) { 3466 int pages; 3467 3468 pages = ram_find_and_save_block(rs); 3469 /* no more blocks to sent */ 3470 if (pages == 0) { 3471 break; 3472 } 3473 if (pages < 0) { 3474 ret = pages; 3475 break; 3476 } 3477 } 3478 qemu_mutex_unlock(&rs->bitmap_mutex); 3479 3480 flush_compressed_data(rs); 3481 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3482 } 3483 3484 if (ret < 0) { 3485 return ret; 3486 } 3487 3488 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3489 if (ret < 0) { 3490 return ret; 3491 } 3492 3493 if (!migrate_multifd_flush_after_each_section()) { 3494 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); 3495 } 3496 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3497 qemu_fflush(f); 3498 3499 return 0; 3500 } 3501 3502 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3503 uint64_t *can_postcopy) 3504 { 3505 RAMState **temp = opaque; 3506 RAMState *rs = *temp; 3507 3508 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3509 3510 if (migrate_postcopy_ram()) { 3511 /* We can do postcopy, and all the data is postcopiable */ 3512 *can_postcopy += remaining_size; 3513 } else { 3514 *must_precopy += remaining_size; 3515 } 3516 } 3517 3518 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3519 uint64_t *can_postcopy) 3520 { 3521 MigrationState *s = migrate_get_current(); 3522 RAMState **temp = opaque; 3523 RAMState *rs = *temp; 3524 3525 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3526 3527 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3528 qemu_mutex_lock_iothread(); 3529 WITH_RCU_READ_LOCK_GUARD() { 3530 migration_bitmap_sync_precopy(rs); 3531 } 3532 qemu_mutex_unlock_iothread(); 3533 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3534 } 3535 3536 if (migrate_postcopy_ram()) { 3537 /* We can do postcopy, and all the data is postcopiable */ 3538 *can_postcopy += remaining_size; 3539 } else { 3540 *must_precopy += remaining_size; 3541 } 3542 } 3543 3544 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3545 { 3546 unsigned int 
xh_len; 3547 int xh_flags; 3548 uint8_t *loaded_data; 3549 3550 /* extract RLE header */ 3551 xh_flags = qemu_get_byte(f); 3552 xh_len = qemu_get_be16(f); 3553 3554 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3555 error_report("Failed to load XBZRLE page - wrong compression!"); 3556 return -1; 3557 } 3558 3559 if (xh_len > TARGET_PAGE_SIZE) { 3560 error_report("Failed to load XBZRLE page - len overflow!"); 3561 return -1; 3562 } 3563 loaded_data = XBZRLE.decoded_buf; 3564 /* load data and decode */ 3565 /* it can change loaded_data to point to an internal buffer */ 3566 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3567 3568 /* decode RLE */ 3569 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3570 TARGET_PAGE_SIZE) == -1) { 3571 error_report("Failed to load XBZRLE page - decode error!"); 3572 return -1; 3573 } 3574 3575 return 0; 3576 } 3577 3578 /** 3579 * ram_block_from_stream: read a RAMBlock id from the migration stream 3580 * 3581 * Must be called from within a rcu critical section. 3582 * 3583 * Returns a pointer from within the RCU-protected ram_list. 3584 * 3585 * @mis: the migration incoming state pointer 3586 * @f: QEMUFile where to read the data from 3587 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3588 * @channel: the channel we're using 3589 */ 3590 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3591 QEMUFile *f, int flags, 3592 int channel) 3593 { 3594 RAMBlock *block = mis->last_recv_block[channel]; 3595 char id[256]; 3596 uint8_t len; 3597 3598 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3599 if (!block) { 3600 error_report("Ack, bad migration stream!"); 3601 return NULL; 3602 } 3603 return block; 3604 } 3605 3606 len = qemu_get_byte(f); 3607 qemu_get_buffer(f, (uint8_t *)id, len); 3608 id[len] = 0; 3609 3610 block = qemu_ram_block_by_name(id); 3611 if (!block) { 3612 error_report("Can't find block %s", id); 3613 return NULL; 3614 } 3615 3616 if (ramblock_is_ignored(block)) { 3617 error_report("block %s should not be migrated !", id); 3618 return NULL; 3619 } 3620 3621 mis->last_recv_block[channel] = block; 3622 3623 return block; 3624 } 3625 3626 static inline void *host_from_ram_block_offset(RAMBlock *block, 3627 ram_addr_t offset) 3628 { 3629 if (!offset_in_ramblock(block, offset)) { 3630 return NULL; 3631 } 3632 3633 return block->host + offset; 3634 } 3635 3636 static void *host_page_from_ram_block_offset(RAMBlock *block, 3637 ram_addr_t offset) 3638 { 3639 /* Note: Explicitly no check against offset_in_ramblock(). */ 3640 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3641 block->page_size); 3642 } 3643 3644 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3645 ram_addr_t offset) 3646 { 3647 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3648 } 3649 3650 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3651 ram_addr_t offset, bool record_bitmap) 3652 { 3653 if (!offset_in_ramblock(block, offset)) { 3654 return NULL; 3655 } 3656 if (!block->colo_cache) { 3657 error_report("%s: colo_cache is NULL in block :%s", 3658 __func__, block->idstr); 3659 return NULL; 3660 } 3661 3662 /* 3663 * During colo checkpoint, we need bitmap of these migrated pages. 3664 * It help us to decide which pages in ram cache should be flushed 3665 * into VM's RAM later. 
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from. We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_compress()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_compress()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * We use it as an indicator which shows if the thread is
         * properly initialized or not.
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
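        /*
         * The worker thread has exited, so it is now safe to tear down its
         * mutex, condvar, zlib stream and compression buffer below.
         */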
qemu_mutex_destroy(&decomp_param[i].mutex); 3803 qemu_cond_destroy(&decomp_param[i].cond); 3804 inflateEnd(&decomp_param[i].stream); 3805 g_free(decomp_param[i].compbuf); 3806 decomp_param[i].compbuf = NULL; 3807 } 3808 g_free(decompress_threads); 3809 g_free(decomp_param); 3810 decompress_threads = NULL; 3811 decomp_param = NULL; 3812 decomp_file = NULL; 3813 } 3814 3815 static int compress_threads_load_setup(QEMUFile *f) 3816 { 3817 int i, thread_count; 3818 3819 if (!migrate_compress()) { 3820 return 0; 3821 } 3822 3823 thread_count = migrate_decompress_threads(); 3824 decompress_threads = g_new0(QemuThread, thread_count); 3825 decomp_param = g_new0(DecompressParam, thread_count); 3826 qemu_mutex_init(&decomp_done_lock); 3827 qemu_cond_init(&decomp_done_cond); 3828 decomp_file = f; 3829 for (i = 0; i < thread_count; i++) { 3830 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3831 goto exit; 3832 } 3833 3834 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3835 qemu_mutex_init(&decomp_param[i].mutex); 3836 qemu_cond_init(&decomp_param[i].cond); 3837 decomp_param[i].done = true; 3838 decomp_param[i].quit = false; 3839 qemu_thread_create(decompress_threads + i, "decompress", 3840 do_data_decompress, decomp_param + i, 3841 QEMU_THREAD_JOINABLE); 3842 } 3843 return 0; 3844 exit: 3845 compress_threads_load_cleanup(); 3846 return -1; 3847 } 3848 3849 static void decompress_data_with_multi_threads(QEMUFile *f, 3850 void *host, int len) 3851 { 3852 int idx, thread_count; 3853 3854 thread_count = migrate_decompress_threads(); 3855 QEMU_LOCK_GUARD(&decomp_done_lock); 3856 while (true) { 3857 for (idx = 0; idx < thread_count; idx++) { 3858 if (decomp_param[idx].done) { 3859 decomp_param[idx].done = false; 3860 qemu_mutex_lock(&decomp_param[idx].mutex); 3861 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3862 decomp_param[idx].des = host; 3863 decomp_param[idx].len = len; 3864 qemu_cond_signal(&decomp_param[idx].cond); 3865 qemu_mutex_unlock(&decomp_param[idx].mutex); 3866 break; 3867 } 3868 } 3869 if (idx < thread_count) { 3870 break; 3871 } else { 3872 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3873 } 3874 } 3875 } 3876 3877 static void colo_init_ram_state(void) 3878 { 3879 ram_state_init(&ram_state); 3880 } 3881 3882 /* 3883 * colo cache: this is for secondary VM, we cache the whole 3884 * memory of the secondary VM, it is need to hold the global lock 3885 * to call this helper. 3886 */ 3887 int colo_init_ram_cache(void) 3888 { 3889 RAMBlock *block; 3890 3891 WITH_RCU_READ_LOCK_GUARD() { 3892 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3893 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3894 NULL, false, false); 3895 if (!block->colo_cache) { 3896 error_report("%s: Can't alloc memory for COLO cache of block %s," 3897 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3898 block->used_length); 3899 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3900 if (block->colo_cache) { 3901 qemu_anon_ram_free(block->colo_cache, block->used_length); 3902 block->colo_cache = NULL; 3903 } 3904 } 3905 return -errno; 3906 } 3907 if (!machine_dump_guest_core(current_machine)) { 3908 qemu_madvise(block->colo_cache, block->used_length, 3909 QEMU_MADV_DONTDUMP); 3910 } 3911 } 3912 } 3913 3914 /* 3915 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3916 * with to decide which page in cache should be flushed into SVM's RAM. Here 3917 * we use the same name 'ram_bitmap' as for migration. 
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* It is necessary to hold the global lock to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc. needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named postcopy_ram_incoming_init()
 * does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
4038 * 4039 * @f: QEMUFile where to send the data 4040 * @channel: the channel to use for loading 4041 */ 4042 int ram_load_postcopy(QEMUFile *f, int channel) 4043 { 4044 int flags = 0, ret = 0; 4045 bool place_needed = false; 4046 bool matches_target_page_size = false; 4047 MigrationIncomingState *mis = migration_incoming_get_current(); 4048 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 4049 4050 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4051 ram_addr_t addr; 4052 void *page_buffer = NULL; 4053 void *place_source = NULL; 4054 RAMBlock *block = NULL; 4055 uint8_t ch; 4056 int len; 4057 4058 addr = qemu_get_be64(f); 4059 4060 /* 4061 * If qemu file error, we should stop here, and then "addr" 4062 * may be invalid 4063 */ 4064 ret = qemu_file_get_error(f); 4065 if (ret) { 4066 break; 4067 } 4068 4069 flags = addr & ~TARGET_PAGE_MASK; 4070 addr &= TARGET_PAGE_MASK; 4071 4072 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 4073 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4074 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 4075 block = ram_block_from_stream(mis, f, flags, channel); 4076 if (!block) { 4077 ret = -EINVAL; 4078 break; 4079 } 4080 4081 /* 4082 * Relying on used_length is racy and can result in false positives. 4083 * We might place pages beyond used_length in case RAM was shrunk 4084 * while in postcopy, which is fine - trying to place via 4085 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 4086 */ 4087 if (!block->host || addr >= block->postcopy_length) { 4088 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4089 ret = -EINVAL; 4090 break; 4091 } 4092 tmp_page->target_pages++; 4093 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 4094 /* 4095 * Postcopy requires that we place whole host pages atomically; 4096 * these may be huge pages for RAMBlocks that are backed by 4097 * hugetlbfs. 4098 * To make it atomic, the data is read into a temporary page 4099 * that's moved into place later. 4100 * The migration protocol uses, possibly smaller, target-pages 4101 * however the source ensures it always sends all the components 4102 * of a host page in one chunk. 4103 */ 4104 page_buffer = tmp_page->tmp_huge_page + 4105 host_page_offset_from_ram_block_offset(block, addr); 4106 /* If all TP are zero then we can optimise the place */ 4107 if (tmp_page->target_pages == 1) { 4108 tmp_page->host_addr = 4109 host_page_from_ram_block_offset(block, addr); 4110 } else if (tmp_page->host_addr != 4111 host_page_from_ram_block_offset(block, addr)) { 4112 /* not the 1st TP within the HP */ 4113 error_report("Non-same host page detected on channel %d: " 4114 "Target host page %p, received host page %p " 4115 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 4116 channel, tmp_page->host_addr, 4117 host_page_from_ram_block_offset(block, addr), 4118 block->idstr, addr, tmp_page->target_pages); 4119 ret = -EINVAL; 4120 break; 4121 } 4122 4123 /* 4124 * If it's the last part of a host page then we place the host 4125 * page 4126 */ 4127 if (tmp_page->target_pages == 4128 (block->page_size / TARGET_PAGE_SIZE)) { 4129 place_needed = true; 4130 } 4131 place_source = tmp_page->tmp_huge_page; 4132 } 4133 4134 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4135 case RAM_SAVE_FLAG_ZERO: 4136 ch = qemu_get_byte(f); 4137 /* 4138 * Can skip to set page_buffer when 4139 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
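             * In that case the whole host page is a single zero target
             * page, so it will be placed with postcopy_place_page_zero()
             * and the temporary buffer contents are never read.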
4140 */ 4141 if (ch || !matches_target_page_size) { 4142 memset(page_buffer, ch, TARGET_PAGE_SIZE); 4143 } 4144 if (ch) { 4145 tmp_page->all_zero = false; 4146 } 4147 break; 4148 4149 case RAM_SAVE_FLAG_PAGE: 4150 tmp_page->all_zero = false; 4151 if (!matches_target_page_size) { 4152 /* For huge pages, we always use temporary buffer */ 4153 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 4154 } else { 4155 /* 4156 * For small pages that matches target page size, we 4157 * avoid the qemu_file copy. Instead we directly use 4158 * the buffer of QEMUFile to place the page. Note: we 4159 * cannot do any QEMUFile operation before using that 4160 * buffer to make sure the buffer is valid when 4161 * placing the page. 4162 */ 4163 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 4164 TARGET_PAGE_SIZE); 4165 } 4166 break; 4167 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4168 tmp_page->all_zero = false; 4169 len = qemu_get_be32(f); 4170 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4171 error_report("Invalid compressed data length: %d", len); 4172 ret = -EINVAL; 4173 break; 4174 } 4175 decompress_data_with_multi_threads(f, page_buffer, len); 4176 break; 4177 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 4178 multifd_recv_sync_main(); 4179 break; 4180 case RAM_SAVE_FLAG_EOS: 4181 /* normal exit */ 4182 if (migrate_multifd_flush_after_each_section()) { 4183 multifd_recv_sync_main(); 4184 } 4185 break; 4186 default: 4187 error_report("Unknown combination of migration flags: 0x%x" 4188 " (postcopy mode)", flags); 4189 ret = -EINVAL; 4190 break; 4191 } 4192 4193 /* Got the whole host page, wait for decompress before placing. */ 4194 if (place_needed) { 4195 ret |= wait_for_decompress_done(); 4196 } 4197 4198 /* Detect for any possible file errors */ 4199 if (!ret && qemu_file_get_error(f)) { 4200 ret = qemu_file_get_error(f); 4201 } 4202 4203 if (!ret && place_needed) { 4204 if (tmp_page->all_zero) { 4205 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 4206 } else { 4207 ret = postcopy_place_page(mis, tmp_page->host_addr, 4208 place_source, block); 4209 } 4210 place_needed = false; 4211 postcopy_temp_page_reset(tmp_page); 4212 } 4213 } 4214 4215 return ret; 4216 } 4217 4218 static bool postcopy_is_running(void) 4219 { 4220 PostcopyState ps = postcopy_state_get(); 4221 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 4222 } 4223 4224 /* 4225 * Flush content of RAM cache into SVM's memory. 4226 * Only flush the pages that be dirtied by PVM or SVM or both. 
4227 */ 4228 void colo_flush_ram_cache(void) 4229 { 4230 RAMBlock *block = NULL; 4231 void *dst_host; 4232 void *src_host; 4233 unsigned long offset = 0; 4234 4235 memory_global_dirty_log_sync(); 4236 WITH_RCU_READ_LOCK_GUARD() { 4237 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4238 ramblock_sync_dirty_bitmap(ram_state, block); 4239 } 4240 } 4241 4242 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 4243 WITH_RCU_READ_LOCK_GUARD() { 4244 block = QLIST_FIRST_RCU(&ram_list.blocks); 4245 4246 while (block) { 4247 unsigned long num = 0; 4248 4249 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4250 if (!offset_in_ramblock(block, 4251 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4252 offset = 0; 4253 num = 0; 4254 block = QLIST_NEXT_RCU(block, next); 4255 } else { 4256 unsigned long i = 0; 4257 4258 for (i = 0; i < num; i++) { 4259 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4260 } 4261 dst_host = block->host 4262 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4263 src_host = block->colo_cache 4264 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4265 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4266 offset += num; 4267 } 4268 } 4269 } 4270 trace_colo_flush_ram_cache_end(); 4271 } 4272 4273 /** 4274 * ram_load_precopy: load pages in precopy case 4275 * 4276 * Returns 0 for success or -errno in case of error 4277 * 4278 * Called in precopy mode by ram_load(). 4279 * rcu_read_lock is taken prior to this being called. 4280 * 4281 * @f: QEMUFile where to send the data 4282 */ 4283 static int ram_load_precopy(QEMUFile *f) 4284 { 4285 MigrationIncomingState *mis = migration_incoming_get_current(); 4286 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 4287 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4288 bool postcopy_advised = migration_incoming_postcopy_advised(); 4289 if (!migrate_compress()) { 4290 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 4291 } 4292 4293 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4294 ram_addr_t addr, total_ram_bytes; 4295 void *host = NULL, *host_bak = NULL; 4296 uint8_t ch; 4297 4298 /* 4299 * Yield periodically to let main loop run, but an iteration of 4300 * the main loop is expensive, so do it each some iterations 4301 */ 4302 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4303 aio_co_schedule(qemu_get_current_aio_context(), 4304 qemu_coroutine_self()); 4305 qemu_coroutine_yield(); 4306 } 4307 i++; 4308 4309 addr = qemu_get_be64(f); 4310 flags = addr & ~TARGET_PAGE_MASK; 4311 addr &= TARGET_PAGE_MASK; 4312 4313 if (flags & invalid_flags) { 4314 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 4315 error_report("Received an unexpected compressed page"); 4316 } 4317 4318 ret = -EINVAL; 4319 break; 4320 } 4321 4322 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4323 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4324 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4325 RAM_CHANNEL_PRECOPY); 4326 4327 host = host_from_ram_block_offset(block, addr); 4328 /* 4329 * After going into COLO stage, we should not load the page 4330 * into SVM's memory directly, we put them into colo_cache firstly. 4331 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4332 * Previously, we copied all these memory in preparing stage of COLO 4333 * while we need to stop VM, which is a time-consuming process. 
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled; this slows the
             * migration down a little, but it clearly reduces the downtime
             * compared with backing up all of the SVM's memory in the COLO
             * preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case
RAM_SAVE_FLAG_MULTIFD_FLUSH: 4454 multifd_recv_sync_main(); 4455 break; 4456 case RAM_SAVE_FLAG_EOS: 4457 /* normal exit */ 4458 if (migrate_multifd_flush_after_each_section()) { 4459 multifd_recv_sync_main(); 4460 } 4461 break; 4462 default: 4463 if (flags & RAM_SAVE_FLAG_HOOK) { 4464 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4465 } else { 4466 error_report("Unknown combination of migration flags: 0x%x", 4467 flags); 4468 ret = -EINVAL; 4469 } 4470 } 4471 if (!ret) { 4472 ret = qemu_file_get_error(f); 4473 } 4474 if (!ret && host_bak) { 4475 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4476 } 4477 } 4478 4479 ret |= wait_for_decompress_done(); 4480 return ret; 4481 } 4482 4483 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4484 { 4485 int ret = 0; 4486 static uint64_t seq_iter; 4487 /* 4488 * If system is running in postcopy mode, page inserts to host memory must 4489 * be atomic 4490 */ 4491 bool postcopy_running = postcopy_is_running(); 4492 4493 seq_iter++; 4494 4495 if (version_id != 4) { 4496 return -EINVAL; 4497 } 4498 4499 /* 4500 * This RCU critical section can be very long running. 4501 * When RCU reclaims in the code start to become numerous, 4502 * it will be necessary to reduce the granularity of this 4503 * critical section. 4504 */ 4505 WITH_RCU_READ_LOCK_GUARD() { 4506 if (postcopy_running) { 4507 /* 4508 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4509 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4510 * service fast page faults. 4511 */ 4512 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4513 } else { 4514 ret = ram_load_precopy(f); 4515 } 4516 } 4517 trace_ram_load_complete(ret, seq_iter); 4518 4519 return ret; 4520 } 4521 4522 static bool ram_has_postcopy(void *opaque) 4523 { 4524 RAMBlock *rb; 4525 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4526 if (ramblock_is_pmem(rb)) { 4527 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4528 "is not supported now!", rb->idstr, rb->host); 4529 return false; 4530 } 4531 } 4532 4533 return migrate_postcopy_ram(); 4534 } 4535 4536 /* Sync all the dirty bitmap with destination VM. */ 4537 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4538 { 4539 RAMBlock *block; 4540 QEMUFile *file = s->to_dst_file; 4541 int ramblock_count = 0; 4542 4543 trace_ram_dirty_bitmap_sync_start(); 4544 4545 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4546 qemu_savevm_send_recv_bitmap(file, block->idstr); 4547 trace_ram_dirty_bitmap_request(block->idstr); 4548 ramblock_count++; 4549 } 4550 4551 trace_ram_dirty_bitmap_sync_wait(); 4552 4553 /* Wait until all the ramblocks' dirty bitmap synced */ 4554 while (ramblock_count--) { 4555 qemu_sem_wait(&s->rp_state.rp_sem); 4556 } 4557 4558 trace_ram_dirty_bitmap_sync_complete(); 4559 4560 return 0; 4561 } 4562 4563 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4564 { 4565 qemu_sem_post(&s->rp_state.rp_sem); 4566 } 4567 4568 /* 4569 * Read the received bitmap, revert it as the initial dirty bitmap. 4570 * This is only used when the postcopy migration is paused but wants 4571 * to resume from a middle point. 
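 *
 * The per-ramblock data on the wire (see ramblock_recv_bitmap_send())
 * is parsed below as: the bitmap size as a big-endian 64-bit value, the
 * bitmap itself in little-endian layout padded up to a multiple of
 * 8 bytes, and a big-endian 64-bit end mark (RAMBLOCK_RECV_BITMAP_ENDING).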
4572 */ 4573 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4574 { 4575 int ret = -EINVAL; 4576 /* from_dst_file is always valid because we're within rp_thread */ 4577 QEMUFile *file = s->rp_state.from_dst_file; 4578 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4579 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4580 uint64_t size, end_mark; 4581 4582 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4583 4584 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4585 error_report("%s: incorrect state %s", __func__, 4586 MigrationStatus_str(s->state)); 4587 return -EINVAL; 4588 } 4589 4590 /* 4591 * Note: see comments in ramblock_recv_bitmap_send() on why we 4592 * need the endianness conversion, and the paddings. 4593 */ 4594 local_size = ROUND_UP(local_size, 8); 4595 4596 /* Add paddings */ 4597 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4598 4599 size = qemu_get_be64(file); 4600 4601 /* The size of the bitmap should match with our ramblock */ 4602 if (size != local_size) { 4603 error_report("%s: ramblock '%s' bitmap size mismatch " 4604 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4605 block->idstr, size, local_size); 4606 ret = -EINVAL; 4607 goto out; 4608 } 4609 4610 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4611 end_mark = qemu_get_be64(file); 4612 4613 ret = qemu_file_get_error(file); 4614 if (ret || size != local_size) { 4615 error_report("%s: read bitmap failed for ramblock '%s': %d" 4616 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4617 __func__, block->idstr, ret, local_size, size); 4618 ret = -EIO; 4619 goto out; 4620 } 4621 4622 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4623 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4624 __func__, block->idstr, end_mark); 4625 ret = -EINVAL; 4626 goto out; 4627 } 4628 4629 /* 4630 * Endianness conversion. We are during postcopy (though paused). 4631 * The dirty bitmap won't change. We can directly modify it. 4632 */ 4633 bitmap_from_le(block->bmap, le_bitmap, nbits); 4634 4635 /* 4636 * What we received is "received bitmap". Revert it as the initial 4637 * dirty bitmap for this ramblock. 4638 */ 4639 bitmap_complement(block->bmap, block->bmap, nbits); 4640 4641 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4642 ramblock_dirty_bitmap_clear_discarded_pages(block); 4643 4644 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4645 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4646 4647 /* 4648 * We succeeded to sync bitmap for current ramblock. If this is 4649 * the last one to sync, we need to notify the main send thread. 
4650 */ 4651 ram_dirty_bitmap_reload_notify(s); 4652 4653 ret = 0; 4654 out: 4655 g_free(le_bitmap); 4656 return ret; 4657 } 4658 4659 static int ram_resume_prepare(MigrationState *s, void *opaque) 4660 { 4661 RAMState *rs = *(RAMState **)opaque; 4662 int ret; 4663 4664 ret = ram_dirty_bitmap_sync_all(s, rs); 4665 if (ret) { 4666 return ret; 4667 } 4668 4669 ram_state_resume_prepare(rs, s->to_dst_file); 4670 4671 return 0; 4672 } 4673 4674 void postcopy_preempt_shutdown_file(MigrationState *s) 4675 { 4676 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4677 qemu_fflush(s->postcopy_qemufile_src); 4678 } 4679 4680 static SaveVMHandlers savevm_ram_handlers = { 4681 .save_setup = ram_save_setup, 4682 .save_live_iterate = ram_save_iterate, 4683 .save_live_complete_postcopy = ram_save_complete, 4684 .save_live_complete_precopy = ram_save_complete, 4685 .has_postcopy = ram_has_postcopy, 4686 .state_pending_exact = ram_state_pending_exact, 4687 .state_pending_estimate = ram_state_pending_estimate, 4688 .load_state = ram_load, 4689 .save_cleanup = ram_save_cleanup, 4690 .load_setup = ram_load_setup, 4691 .load_cleanup = ram_load_cleanup, 4692 .resume_prepare = ram_resume_prepare, 4693 }; 4694 4695 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4696 size_t old_size, size_t new_size) 4697 { 4698 PostcopyState ps = postcopy_state_get(); 4699 ram_addr_t offset; 4700 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4701 Error *err = NULL; 4702 4703 if (ramblock_is_ignored(rb)) { 4704 return; 4705 } 4706 4707 if (!migration_is_idle()) { 4708 /* 4709 * Precopy code on the source cannot deal with the size of RAM blocks 4710 * changing at random points in time - especially after sending the 4711 * RAM block sizes in the migration stream, they must no longer change. 4712 * Abort and indicate a proper reason. 4713 */ 4714 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4715 migration_cancel(err); 4716 error_free(err); 4717 } 4718 4719 switch (ps) { 4720 case POSTCOPY_INCOMING_ADVISE: 4721 /* 4722 * Update what ram_postcopy_incoming_init()->init_range() does at the 4723 * time postcopy was advised. Syncing RAM blocks with the source will 4724 * result in RAM resizes. 4725 */ 4726 if (old_size < new_size) { 4727 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4728 error_report("RAM block '%s' discard of resized RAM failed", 4729 rb->idstr); 4730 } 4731 } 4732 rb->postcopy_length = new_size; 4733 break; 4734 case POSTCOPY_INCOMING_NONE: 4735 case POSTCOPY_INCOMING_RUNNING: 4736 case POSTCOPY_INCOMING_END: 4737 /* 4738 * Once our guest is running, postcopy does no longer care about 4739 * resizes. When growing, the new memory was not available on the 4740 * source, no handler needed. 4741 */ 4742 break; 4743 default: 4744 error_report("RAM block '%s' resized during postcopy state: %d", 4745 rb->idstr, ps); 4746 exit(-1); 4747 } 4748 } 4749 4750 static RAMBlockNotifier ram_mig_ram_notifier = { 4751 .ram_block_resized = ram_mig_ram_block_resized, 4752 }; 4753 4754 void ram_mig_init(void) 4755 { 4756 qemu_mutex_init(&XBZRLE.lock); 4757 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4758 ram_block_notifier_add(&ram_mig_ram_notifier); 4759 } 4760