1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "io/channel-null.h" 36 #include "xbzrle.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qmp/qerror.h" 49 #include "trace.h" 50 #include "exec/ram_addr.h" 51 #include "exec/target_page.h" 52 #include "qemu/rcu_queue.h" 53 #include "migration/colo.h" 54 #include "block.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 #include "options.h" 61 62 #include "hw/boards.h" /* for machine_dump_guest_core() */ 63 64 #if defined(__linux__) 65 #include "qemu/userfaultfd.h" 66 #endif /* defined(__linux__) */ 67 68 /***********************************************************/ 69 /* ram save/restore */ 70 71 /* 72 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 73 * worked for pages that were filled with the same char. We switched 74 * it to only search for the zero value. And to avoid confusion with 75 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
76 */ 77 /* 78 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 79 */ 80 #define RAM_SAVE_FLAG_FULL 0x01 81 #define RAM_SAVE_FLAG_ZERO 0x02 82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 83 #define RAM_SAVE_FLAG_PAGE 0x08 84 #define RAM_SAVE_FLAG_EOS 0x10 85 #define RAM_SAVE_FLAG_CONTINUE 0x20 86 #define RAM_SAVE_FLAG_XBZRLE 0x40 87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 88 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 89 /* We can't use any flag that is bigger than 0x200 */ 90 91 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, 92 uint8_t *, int) = xbzrle_encode_buffer; 93 #if defined(CONFIG_AVX512BW_OPT) 94 #include "qemu/cpuid.h" 95 static void __attribute__((constructor)) init_cpu_flag(void) 96 { 97 unsigned max = __get_cpuid_max(0, NULL); 98 int a, b, c, d; 99 if (max >= 1) { 100 __cpuid(1, a, b, c, d); 101 /* We must check that AVX is not just available, but usable. */ 102 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { 103 int bv; 104 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); 105 __cpuid_count(7, 0, a, b, c, d); 106 /* 0xe6: 107 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 108 * and ZMM16-ZMM31 state are enabled by OS) 109 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) 110 */ 111 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { 112 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; 113 } 114 } 115 } 116 } 117 #endif 118 119 XBZRLECacheStats xbzrle_counters; 120 121 /* used by the search for pages to send */ 122 struct PageSearchStatus { 123 /* The migration channel used for a specific host page */ 124 QEMUFile *pss_channel; 125 /* Last block from where we have sent data */ 126 RAMBlock *last_sent_block; 127 /* Current block being searched */ 128 RAMBlock *block; 129 /* Current page to search from */ 130 unsigned long page; 131 /* Set once we wrap around */ 132 bool complete_round; 133 /* Whether we're sending a host page */ 134 bool host_page_sending; 135 /* The start/end of current host page. Invalid if host_page_sending==false */ 136 unsigned long host_page_start; 137 unsigned long host_page_end; 138 }; 139 typedef struct PageSearchStatus PageSearchStatus; 140 141 /* struct contains XBZRLE cache and a static page 142 used by the compression */ 143 static struct { 144 /* buffer used for XBZRLE encoding */ 145 uint8_t *encoded_buf; 146 /* buffer for storing page content */ 147 uint8_t *current_buf; 148 /* Cache for XBZRLE, Protected by lock. */ 149 PageCache *cache; 150 QemuMutex lock; 151 /* it will store a page full of zeros */ 152 uint8_t *zero_target_page; 153 /* buffer used for XBZRLE decoding */ 154 uint8_t *decoded_buf; 155 } XBZRLE; 156 157 static void XBZRLE_cache_lock(void) 158 { 159 if (migrate_xbzrle()) { 160 qemu_mutex_lock(&XBZRLE.lock); 161 } 162 } 163 164 static void XBZRLE_cache_unlock(void) 165 { 166 if (migrate_xbzrle()) { 167 qemu_mutex_unlock(&XBZRLE.lock); 168 } 169 } 170 171 /** 172 * xbzrle_cache_resize: resize the xbzrle cache 173 * 174 * This function is called from migrate_params_apply in main 175 * thread, possibly while a migration is in progress. A running 176 * migration may be using the cache and might finish during this call, 177 * hence changes to the cache are protected by XBZRLE.lock(). 
178 * 179 * Returns 0 for success or -1 for error 180 * 181 * @new_size: new cache size 182 * @errp: set *errp if the check failed, with reason 183 */ 184 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 185 { 186 PageCache *new_cache; 187 int64_t ret = 0; 188 189 /* Check for truncation */ 190 if (new_size != (size_t)new_size) { 191 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 192 "exceeding address space"); 193 return -1; 194 } 195 196 if (new_size == migrate_xbzrle_cache_size()) { 197 /* nothing to do */ 198 return 0; 199 } 200 201 XBZRLE_cache_lock(); 202 203 if (XBZRLE.cache != NULL) { 204 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 205 if (!new_cache) { 206 ret = -1; 207 goto out; 208 } 209 210 cache_fini(XBZRLE.cache); 211 XBZRLE.cache = new_cache; 212 } 213 out: 214 XBZRLE_cache_unlock(); 215 return ret; 216 } 217 218 static bool postcopy_preempt_active(void) 219 { 220 return migrate_postcopy_preempt() && migration_in_postcopy(); 221 } 222 223 bool ramblock_is_ignored(RAMBlock *block) 224 { 225 return !qemu_ram_is_migratable(block) || 226 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 227 } 228 229 #undef RAMBLOCK_FOREACH 230 231 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 232 { 233 RAMBlock *block; 234 int ret = 0; 235 236 RCU_READ_LOCK_GUARD(); 237 238 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 239 ret = func(block, opaque); 240 if (ret) { 241 break; 242 } 243 } 244 return ret; 245 } 246 247 static void ramblock_recv_map_init(void) 248 { 249 RAMBlock *rb; 250 251 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 252 assert(!rb->receivedmap); 253 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 254 } 255 } 256 257 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 258 { 259 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 260 rb->receivedmap); 261 } 262 263 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 264 { 265 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 266 } 267 268 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 269 { 270 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 271 } 272 273 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 274 size_t nr) 275 { 276 bitmap_set_atomic(rb->receivedmap, 277 ramblock_recv_bitmap_offset(host_addr, rb), 278 nr); 279 } 280 281 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 282 283 /* 284 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 285 * 286 * Returns >0 if success with sent bytes, or <0 if error. 287 */ 288 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 289 const char *block_name) 290 { 291 RAMBlock *block = qemu_ram_block_by_name(block_name); 292 unsigned long *le_bitmap, nbits; 293 uint64_t size; 294 295 if (!block) { 296 error_report("%s: invalid block name: %s", __func__, block_name); 297 return -1; 298 } 299 300 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 301 302 /* 303 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 304 * machines we may need 4 more bytes for padding (see below 305 * comment). So extend it a bit before hand. 306 */ 307 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 308 309 /* 310 * Always use little endian when sending the bitmap. This is 311 * required that when source and destination VMs are not using the 312 * same endianness. (Note: big endian won't work.) 
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
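/*
 * Illustrative note: a single RAMSrcPageRequest can cover more than one
 * target page (len > TARGET_PAGE_SIZE). unqueue_page() below consumes such
 * a request one target page at a time, advancing offset and shrinking len,
 * and only drops the entry (and its memory-region reference) once len is
 * exhausted.
 */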
/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

RAMStats ram_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&ram_counters.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_counters.postcopy_bytes, bytes);
    } else {
        stat64_add(&ram_counters.downtime_bytes, bytes);
    }
    stat64_add(&ram_counters.transferred, bytes);
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
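/*
 * Illustrative summary of the compression handshake (a description of the
 * code below, not an additional mechanism): the migration thread hands a
 * page to a worker by setting param->block/offset under param->mutex and
 * signalling param->cond; the worker compresses into its private QEMUFile
 * and, under comp_done_lock, sets param->done and signals comp_done_cond.
 * The migration thread later drains that QEMUFile into the real migration
 * stream (see flush_compressed_data() and compress_page_with_multi_thread()).
 */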
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page. Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_compress() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_compress()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
migrate_compress_level()) != Z_OK) { 644 g_free(comp_param[i].originbuf); 645 goto exit; 646 } 647 648 /* comp_param[i].file is just used as a dummy buffer to save data, 649 * set its ops to empty. 650 */ 651 comp_param[i].file = qemu_file_new_output( 652 QIO_CHANNEL(qio_channel_null_new())); 653 comp_param[i].done = true; 654 comp_param[i].quit = false; 655 qemu_mutex_init(&comp_param[i].mutex); 656 qemu_cond_init(&comp_param[i].cond); 657 qemu_thread_create(compress_threads + i, "compress", 658 do_data_compress, comp_param + i, 659 QEMU_THREAD_JOINABLE); 660 } 661 return 0; 662 663 exit: 664 compress_threads_save_cleanup(); 665 return -1; 666 } 667 668 /** 669 * save_page_header: write page header to wire 670 * 671 * If this is the 1st block, it also writes the block identification 672 * 673 * Returns the number of bytes written 674 * 675 * @pss: current PSS channel status 676 * @block: block that contains the page we want to send 677 * @offset: offset inside the block for the page 678 * in the lower bits, it contains flags 679 */ 680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 681 RAMBlock *block, ram_addr_t offset) 682 { 683 size_t size, len; 684 bool same_block = (block == pss->last_sent_block); 685 686 if (same_block) { 687 offset |= RAM_SAVE_FLAG_CONTINUE; 688 } 689 qemu_put_be64(f, offset); 690 size = 8; 691 692 if (!same_block) { 693 len = strlen(block->idstr); 694 qemu_put_byte(f, len); 695 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 696 size += 1 + len; 697 pss->last_sent_block = block; 698 } 699 return size; 700 } 701 702 /** 703 * mig_throttle_guest_down: throttle down the guest 704 * 705 * Reduce amount of guest cpu execution to hopefully slow down memory 706 * writes. If guest dirty memory rate is reduced below the rate at 707 * which we can transfer pages to the destination then we should be 708 * able to complete migration. Some workloads dirty memory way too 709 * fast and will not effectively converge, even with auto-converge. 710 */ 711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 712 uint64_t bytes_dirty_threshold) 713 { 714 uint64_t pct_initial = migrate_cpu_throttle_initial(); 715 uint64_t pct_increment = migrate_cpu_throttle_increment(); 716 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 717 int pct_max = migrate_max_cpu_throttle(); 718 719 uint64_t throttle_now = cpu_throttle_get_percentage(); 720 uint64_t cpu_now, cpu_ideal, throttle_inc; 721 722 /* We have not started throttling yet. Let's start it. */ 723 if (!cpu_throttle_active()) { 724 cpu_throttle_set(pct_initial); 725 } else { 726 /* Throttling already on, just increase the rate */ 727 if (!pct_tailslow) { 728 throttle_inc = pct_increment; 729 } else { 730 /* Compute the ideal CPU percentage used by Guest, which may 731 * make the dirty rate match the dirty rate threshold. 
*/ 732 cpu_now = 100 - throttle_now; 733 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 734 bytes_dirty_period); 735 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 736 } 737 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 738 } 739 } 740 741 void mig_throttle_counter_reset(void) 742 { 743 RAMState *rs = ram_state; 744 745 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 746 rs->num_dirty_pages_period = 0; 747 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred); 748 } 749 750 /** 751 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 752 * 753 * @rs: current RAM state 754 * @current_addr: address for the zero page 755 * 756 * Update the xbzrle cache to reflect a page that's been sent as all 0. 757 * The important thing is that a stale (not-yet-0'd) page be replaced 758 * by the new data. 759 * As a bonus, if the page wasn't in the cache it gets added so that 760 * when a small write is made into the 0'd page it gets XBZRLE sent. 761 */ 762 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 763 { 764 /* We don't care if this fails to allocate a new cache page 765 * as long as it updated an old one */ 766 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 767 stat64_get(&ram_counters.dirty_sync_count)); 768 } 769 770 #define ENCODING_FLAG_XBZRLE 0x1 771 772 /** 773 * save_xbzrle_page: compress and send current page 774 * 775 * Returns: 1 means that we wrote the page 776 * 0 means that page is identical to the one already sent 777 * -1 means that xbzrle would be longer than normal 778 * 779 * @rs: current RAM state 780 * @pss: current PSS channel 781 * @current_data: pointer to the address of the page contents 782 * @current_addr: addr of the page 783 * @block: block that contains the page we want to send 784 * @offset: offset inside the block for the page 785 */ 786 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 787 uint8_t **current_data, ram_addr_t current_addr, 788 RAMBlock *block, ram_addr_t offset) 789 { 790 int encoded_len = 0, bytes_xbzrle; 791 uint8_t *prev_cached_page; 792 QEMUFile *file = pss->pss_channel; 793 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count); 794 795 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 796 xbzrle_counters.cache_miss++; 797 if (!rs->last_stage) { 798 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 799 generation) == -1) { 800 return -1; 801 } else { 802 /* update *current_data when the page has been 803 inserted into cache */ 804 *current_data = get_cached_data(XBZRLE.cache, current_addr); 805 } 806 } 807 return -1; 808 } 809 810 /* 811 * Reaching here means the page has hit the xbzrle cache, no matter what 812 * encoding result it is (normal encoding, overflow or skipping the page), 813 * count the page as encoded. This is used to calculate the encoding rate. 814 * 815 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 816 * 2nd page turns out to be skipped (i.e. no new bytes written to the 817 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 818 * skipped page included. In this way, the encoding rate can tell if the 819 * guest page is good for xbzrle encoding. 
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found. Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}
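/*
 * Worked example (illustrative): migration_clear_memory_region_dirty_bitmap()
 * below clears the memory-region dirty bitmap lazily in chunks of
 * 1ULL << (TARGET_PAGE_BITS + clear_bmap_shift) bytes. With the minimum
 * shift of 6, one chunk covers 64 target pages (e.g. 256KiB for 4KiB target
 * pages), and the chunk start address is aligned down to that size.
 */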
static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}
static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
1106 */ 1107 uint64_t ram_pagesize_summary(void) 1108 { 1109 RAMBlock *block; 1110 uint64_t summary = 0; 1111 1112 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1113 summary |= block->page_size; 1114 } 1115 1116 return summary; 1117 } 1118 1119 uint64_t ram_get_total_transferred_pages(void) 1120 { 1121 return stat64_get(&ram_counters.normal_pages) + 1122 stat64_get(&ram_counters.zero_pages) + 1123 compression_counters.pages + xbzrle_counters.pages; 1124 } 1125 1126 static void migration_update_rates(RAMState *rs, int64_t end_time) 1127 { 1128 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1129 double compressed_size; 1130 1131 /* calculate period counters */ 1132 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1133 / (end_time - rs->time_last_bitmap_sync); 1134 1135 if (!page_count) { 1136 return; 1137 } 1138 1139 if (migrate_xbzrle()) { 1140 double encoded_size, unencoded_size; 1141 1142 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1143 rs->xbzrle_cache_miss_prev) / page_count; 1144 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1145 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1146 TARGET_PAGE_SIZE; 1147 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1148 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1149 xbzrle_counters.encoding_rate = 0; 1150 } else { 1151 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1152 } 1153 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1154 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1155 } 1156 1157 if (migrate_compress()) { 1158 compression_counters.busy_rate = (double)(compression_counters.busy - 1159 rs->compress_thread_busy_prev) / page_count; 1160 rs->compress_thread_busy_prev = compression_counters.busy; 1161 1162 compressed_size = compression_counters.compressed_size - 1163 rs->compressed_size_prev; 1164 if (compressed_size) { 1165 double uncompressed_size = (compression_counters.pages - 1166 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1167 1168 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1169 compression_counters.compression_rate = 1170 uncompressed_size / compressed_size; 1171 1172 rs->compress_pages_prev = compression_counters.pages; 1173 rs->compressed_size_prev = compression_counters.compressed_size; 1174 } 1175 } 1176 } 1177 1178 static void migration_trigger_throttle(RAMState *rs) 1179 { 1180 uint64_t threshold = migrate_throttle_trigger_threshold(); 1181 uint64_t bytes_xfer_period = 1182 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev; 1183 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1184 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1185 1186 /* During block migration the auto-converge logic incorrectly detects 1187 * that ram migration makes no progress. Avoid this by disabling the 1188 * throttling logic during the bulk phase of block migration. */ 1189 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1190 /* The following detection logic can be refined later. For now: 1191 Check to see if the ratio between dirtied bytes and the approx. 1192 amount of bytes that just got transferred since the last time 1193 we were in this routine reaches the threshold. If that happens 1194 twice, start or increase throttling. 
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&ram_counters.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}
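/*
 * Wire-format note (illustrative): a zero page is sent as the usual page
 * header written by save_page_header() (the 8-byte offset word with
 * RAM_SAVE_FLAG_ZERO set, plus the block idstr when the block changes)
 * followed by a single zero byte, so save_zero_page_to_file() returns the
 * header length + 1.
 */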
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&ram_counters.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_counters.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_counters.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_counters.normal_pages, 1);
    return 1;
}
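/*
 * Accounting note (illustrative): for a normal page, save_normal_page()
 * above charges the header returned by save_page_header() plus
 * TARGET_PAGE_SIZE to ram_transferred_add() and bumps the normal_pages
 * counter; e.g. with 4KiB target pages and a continuing block (8-byte
 * header) that is 4104 bytes accounted per page.
 */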
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_counters.normal_pages, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, f, block, offset)) {
        return true;
    }

    save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM,
     * so that we can catch any error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_counters.zero_pages, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE.
*/ 1493 compression_counters.compressed_size += bytes_xmit - 8; 1494 compression_counters.pages++; 1495 } 1496 1497 static bool save_page_use_compression(RAMState *rs); 1498 1499 static void flush_compressed_data(RAMState *rs) 1500 { 1501 MigrationState *ms = migrate_get_current(); 1502 int idx, len, thread_count; 1503 1504 if (!save_page_use_compression(rs)) { 1505 return; 1506 } 1507 thread_count = migrate_compress_threads(); 1508 1509 qemu_mutex_lock(&comp_done_lock); 1510 for (idx = 0; idx < thread_count; idx++) { 1511 while (!comp_param[idx].done) { 1512 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1513 } 1514 } 1515 qemu_mutex_unlock(&comp_done_lock); 1516 1517 for (idx = 0; idx < thread_count; idx++) { 1518 qemu_mutex_lock(&comp_param[idx].mutex); 1519 if (!comp_param[idx].quit) { 1520 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); 1521 /* 1522 * it's safe to fetch zero_page without holding comp_done_lock 1523 * as there is no further request submitted to the thread, 1524 * i.e, the thread should be waiting for a request at this point. 1525 */ 1526 update_compress_thread_counts(&comp_param[idx], len); 1527 } 1528 qemu_mutex_unlock(&comp_param[idx].mutex); 1529 } 1530 } 1531 1532 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1533 ram_addr_t offset) 1534 { 1535 param->block = block; 1536 param->offset = offset; 1537 } 1538 1539 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) 1540 { 1541 int idx, thread_count, bytes_xmit = -1, pages = -1; 1542 bool wait = migrate_compress_wait_thread(); 1543 MigrationState *ms = migrate_get_current(); 1544 1545 thread_count = migrate_compress_threads(); 1546 qemu_mutex_lock(&comp_done_lock); 1547 retry: 1548 for (idx = 0; idx < thread_count; idx++) { 1549 if (comp_param[idx].done) { 1550 comp_param[idx].done = false; 1551 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, 1552 comp_param[idx].file); 1553 qemu_mutex_lock(&comp_param[idx].mutex); 1554 set_compress_params(&comp_param[idx], block, offset); 1555 qemu_cond_signal(&comp_param[idx].cond); 1556 qemu_mutex_unlock(&comp_param[idx].mutex); 1557 pages = 1; 1558 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1559 break; 1560 } 1561 } 1562 1563 /* 1564 * wait for the free thread if the user specifies 'compress-wait-thread', 1565 * otherwise we will post the page out in the main thread as normal page. 1566 */ 1567 if (pages < 0 && wait) { 1568 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1569 goto retry; 1570 } 1571 qemu_mutex_unlock(&comp_done_lock); 1572 1573 return pages; 1574 } 1575 1576 #define PAGE_ALL_CLEAN 0 1577 #define PAGE_TRY_AGAIN 1 1578 #define PAGE_DIRTY_FOUND 2 1579 /** 1580 * find_dirty_block: find the next dirty page and update any state 1581 * associated with the search process. 1582 * 1583 * Returns: 1584 * PAGE_ALL_CLEAN: no dirty page found, give up 1585 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1586 * PAGE_DIRTY_FOUND: dirty page found 1587 * 1588 * @rs: current RAM state 1589 * @pss: data about the state of the current dirty page scan 1590 * @again: set to false if the search has scanned the whole of RAM 1591 */ 1592 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1593 { 1594 /* Update pss->page for the next dirty bit in ramblock */ 1595 pss_find_next_dirty(pss); 1596 1597 if (pss->complete_round && pss->block == rs->last_seen_block && 1598 pss->page >= rs->last_page) { 1599 /* 1600 * We've been once around the RAM and haven't found anything. 
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
1663 */ 1664 assert(postcopy_has_request(rs)); 1665 1666 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1667 block = entry->rb; 1668 *offset = entry->offset; 1669 1670 if (entry->len > TARGET_PAGE_SIZE) { 1671 entry->len -= TARGET_PAGE_SIZE; 1672 entry->offset += TARGET_PAGE_SIZE; 1673 } else { 1674 memory_region_unref(block->mr); 1675 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1676 g_free(entry); 1677 migration_consume_urgent_request(); 1678 } 1679 1680 return block; 1681 } 1682 1683 #if defined(__linux__) 1684 /** 1685 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1686 * is found, return RAM block pointer and page offset 1687 * 1688 * Returns pointer to the RAMBlock containing faulting page, 1689 * NULL if no write faults are pending 1690 * 1691 * @rs: current RAM state 1692 * @offset: page offset from the beginning of the block 1693 */ 1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1695 { 1696 struct uffd_msg uffd_msg; 1697 void *page_address; 1698 RAMBlock *block; 1699 int res; 1700 1701 if (!migrate_background_snapshot()) { 1702 return NULL; 1703 } 1704 1705 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1706 if (res <= 0) { 1707 return NULL; 1708 } 1709 1710 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1711 block = qemu_ram_block_from_host(page_address, false, offset); 1712 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1713 return block; 1714 } 1715 1716 /** 1717 * ram_save_release_protection: release UFFD write protection after 1718 * a range of pages has been saved 1719 * 1720 * @rs: current RAM state 1721 * @pss: page-search-status structure 1722 * @start_page: index of the first page in the range relative to pss->block 1723 * 1724 * Returns 0 on success, negative value in case of an error 1725 */ 1726 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1727 unsigned long start_page) 1728 { 1729 int res = 0; 1730 1731 /* Check if page is from UFFD-managed region. */ 1732 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1733 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1734 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1735 1736 /* Flush async buffers before un-protect. */ 1737 qemu_fflush(pss->pss_channel); 1738 /* Un-protect memory range. 
*/ 1739 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1740 false, false); 1741 } 1742 1743 return res; 1744 } 1745 1746 /* ram_write_tracking_available: check if kernel supports required UFFD features 1747 * 1748 * Returns true if supports, false otherwise 1749 */ 1750 bool ram_write_tracking_available(void) 1751 { 1752 uint64_t uffd_features; 1753 int res; 1754 1755 res = uffd_query_features(&uffd_features); 1756 return (res == 0 && 1757 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1758 } 1759 1760 /* ram_write_tracking_compatible: check if guest configuration is 1761 * compatible with 'write-tracking' 1762 * 1763 * Returns true if compatible, false otherwise 1764 */ 1765 bool ram_write_tracking_compatible(void) 1766 { 1767 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1768 int uffd_fd; 1769 RAMBlock *block; 1770 bool ret = false; 1771 1772 /* Open UFFD file descriptor */ 1773 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1774 if (uffd_fd < 0) { 1775 return false; 1776 } 1777 1778 RCU_READ_LOCK_GUARD(); 1779 1780 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1781 uint64_t uffd_ioctls; 1782 1783 /* Nothing to do with read-only and MMIO-writable regions */ 1784 if (block->mr->readonly || block->mr->rom_device) { 1785 continue; 1786 } 1787 /* Try to register block memory via UFFD-IO to track writes */ 1788 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1789 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1790 goto out; 1791 } 1792 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1793 goto out; 1794 } 1795 } 1796 ret = true; 1797 1798 out: 1799 uffd_close_fd(uffd_fd); 1800 return ret; 1801 } 1802 1803 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1804 ram_addr_t size) 1805 { 1806 const ram_addr_t end = offset + size; 1807 1808 /* 1809 * We read one byte of each page; this will preallocate page tables if 1810 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1811 * where no page was populated yet. This might require adaption when 1812 * supporting other mappings, like shmem. 1813 */ 1814 for (; offset < end; offset += block->page_size) { 1815 char tmp = *((char *)block->host + offset); 1816 1817 /* Don't optimize the read out */ 1818 asm volatile("" : "+r" (tmp)); 1819 } 1820 } 1821 1822 static inline int populate_read_section(MemoryRegionSection *section, 1823 void *opaque) 1824 { 1825 const hwaddr size = int128_get64(section->size); 1826 hwaddr offset = section->offset_within_region; 1827 RAMBlock *block = section->mr->ram_block; 1828 1829 populate_read_range(block, offset, size); 1830 return 0; 1831 } 1832 1833 /* 1834 * ram_block_populate_read: preallocate page tables and populate pages in the 1835 * RAM block by reading a byte of each page. 1836 * 1837 * Since it's solely used for userfault_fd WP feature, here we just 1838 * hardcode page size to qemu_real_host_page_size. 1839 * 1840 * @block: RAM block to populate 1841 */ 1842 static void ram_block_populate_read(RAMBlock *rb) 1843 { 1844 /* 1845 * Skip populating all pages that fall into a discarded range as managed by 1846 * a RamDiscardManager responsible for the mapped memory region of the 1847 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1848 * must not get populated automatically. 
We don't have to track 1849 * modifications via userfaultfd WP reliably, because these pages will 1850 * not be part of the migration stream either way -- see 1851 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1852 * 1853 * Note: The result is only stable while migrating (precopy/postcopy). 1854 */ 1855 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1856 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1857 MemoryRegionSection section = { 1858 .mr = rb->mr, 1859 .offset_within_region = 0, 1860 .size = rb->mr->size, 1861 }; 1862 1863 ram_discard_manager_replay_populated(rdm, &section, 1864 populate_read_section, NULL); 1865 } else { 1866 populate_read_range(rb, 0, rb->used_length); 1867 } 1868 } 1869 1870 /* 1871 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1872 */ 1873 void ram_write_tracking_prepare(void) 1874 { 1875 RAMBlock *block; 1876 1877 RCU_READ_LOCK_GUARD(); 1878 1879 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1880 /* Nothing to do with read-only and MMIO-writable regions */ 1881 if (block->mr->readonly || block->mr->rom_device) { 1882 continue; 1883 } 1884 1885 /* 1886 * Populate pages of the RAM block before enabling userfault_fd 1887 * write protection. 1888 * 1889 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1890 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1891 * pages with pte_none() entries in page table. 1892 */ 1893 ram_block_populate_read(block); 1894 } 1895 } 1896 1897 static inline int uffd_protect_section(MemoryRegionSection *section, 1898 void *opaque) 1899 { 1900 const hwaddr size = int128_get64(section->size); 1901 const hwaddr offset = section->offset_within_region; 1902 RAMBlock *rb = section->mr->ram_block; 1903 int uffd_fd = (uintptr_t)opaque; 1904 1905 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1906 false); 1907 } 1908 1909 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1910 { 1911 assert(rb->flags & RAM_UF_WRITEPROTECT); 1912 1913 /* See ram_block_populate_read() */ 1914 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1915 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1916 MemoryRegionSection section = { 1917 .mr = rb->mr, 1918 .offset_within_region = 0, 1919 .size = rb->mr->size, 1920 }; 1921 1922 return ram_discard_manager_replay_populated(rdm, &section, 1923 uffd_protect_section, 1924 (void *)(uintptr_t)uffd_fd); 1925 } 1926 return uffd_change_protection(uffd_fd, rb->host, 1927 rb->used_length, true, false); 1928 } 1929 1930 /* 1931 * ram_write_tracking_start: start UFFD-WP memory tracking 1932 * 1933 * Returns 0 for success or negative value in case of error 1934 */ 1935 int ram_write_tracking_start(void) 1936 { 1937 int uffd_fd; 1938 RAMState *rs = ram_state; 1939 RAMBlock *block; 1940 1941 /* Open UFFD file descriptor */ 1942 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1943 if (uffd_fd < 0) { 1944 return uffd_fd; 1945 } 1946 rs->uffdio_fd = uffd_fd; 1947 1948 RCU_READ_LOCK_GUARD(); 1949 1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1951 /* Nothing to do with read-only and MMIO-writable regions */ 1952 if (block->mr->readonly || block->mr->rom_device) { 1953 continue; 1954 } 1955 1956 /* Register block memory with UFFD to track writes */ 1957 if (uffd_register_memory(rs->uffdio_fd, block->host, 1958 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1959 goto fail; 1960 } 1961 block->flags |= RAM_UF_WRITEPROTECT; 1962 memory_region_ref(block->mr); 1963 1964 /* Apply
UFFD write protection to the block memory range */ 1965 if (ram_block_uffd_protect(block, uffd_fd)) { 1966 goto fail; 1967 } 1968 1969 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1970 block->host, block->max_length); 1971 } 1972 1973 return 0; 1974 1975 fail: 1976 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1977 1978 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1979 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1980 continue; 1981 } 1982 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1983 /* Cleanup flags and remove reference */ 1984 block->flags &= ~RAM_UF_WRITEPROTECT; 1985 memory_region_unref(block->mr); 1986 } 1987 1988 uffd_close_fd(uffd_fd); 1989 rs->uffdio_fd = -1; 1990 return -1; 1991 } 1992 1993 /** 1994 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1995 */ 1996 void ram_write_tracking_stop(void) 1997 { 1998 RAMState *rs = ram_state; 1999 RAMBlock *block; 2000 2001 RCU_READ_LOCK_GUARD(); 2002 2003 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2004 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2005 continue; 2006 } 2007 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2008 2009 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2010 block->host, block->max_length); 2011 2012 /* Cleanup flags and remove reference */ 2013 block->flags &= ~RAM_UF_WRITEPROTECT; 2014 memory_region_unref(block->mr); 2015 } 2016 2017 /* Finally close UFFD file descriptor */ 2018 uffd_close_fd(rs->uffdio_fd); 2019 rs->uffdio_fd = -1; 2020 } 2021 2022 #else 2023 /* No target OS support, stubs just fail or ignore */ 2024 2025 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2026 { 2027 (void) rs; 2028 (void) offset; 2029 2030 return NULL; 2031 } 2032 2033 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2034 unsigned long start_page) 2035 { 2036 (void) rs; 2037 (void) pss; 2038 (void) start_page; 2039 2040 return 0; 2041 } 2042 2043 bool ram_write_tracking_available(void) 2044 { 2045 return false; 2046 } 2047 2048 bool ram_write_tracking_compatible(void) 2049 { 2050 assert(0); 2051 return false; 2052 } 2053 2054 int ram_write_tracking_start(void) 2055 { 2056 assert(0); 2057 return -1; 2058 } 2059 2060 void ram_write_tracking_stop(void) 2061 { 2062 assert(0); 2063 } 2064 #endif /* defined(__linux__) */ 2065 2066 /** 2067 * get_queued_page: unqueue a page from the postcopy requests 2068 * 2069 * Skips pages that are already sent (!dirty) 2070 * 2071 * Returns true if a queued page is found 2072 * 2073 * @rs: current RAM state 2074 * @pss: data about the state of the current dirty page scan 2075 */ 2076 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2077 { 2078 RAMBlock *block; 2079 ram_addr_t offset; 2080 bool dirty; 2081 2082 do { 2083 block = unqueue_page(rs, &offset); 2084 /* 2085 * We're sending this page, and since it's postcopy nothing else 2086 * will dirty it, and we must make sure it doesn't get sent again 2087 * even if this queue request was received after the background 2088 * search already sent it. 
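 * That is why the dirty bit is tested right below: a queued request for a page that is no longer dirty is simply dropped (and only traced) rather than sent twice.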
2089 */ 2090 if (block) { 2091 unsigned long page; 2092 2093 page = offset >> TARGET_PAGE_BITS; 2094 dirty = test_bit(page, block->bmap); 2095 if (!dirty) { 2096 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2097 page); 2098 } else { 2099 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2100 } 2101 } 2102 2103 } while (block && !dirty); 2104 2105 if (!block) { 2106 /* 2107 * Poll write faults too if background snapshot is enabled; that's 2108 * when we have vcpus got blocked by the write protected pages. 2109 */ 2110 block = poll_fault_page(rs, &offset); 2111 } 2112 2113 if (block) { 2114 /* 2115 * We want the background search to continue from the queued page 2116 * since the guest is likely to want other pages near to the page 2117 * it just requested. 2118 */ 2119 pss->block = block; 2120 pss->page = offset >> TARGET_PAGE_BITS; 2121 2122 /* 2123 * This unqueued page would break the "one round" check, even is 2124 * really rare. 2125 */ 2126 pss->complete_round = false; 2127 } 2128 2129 return !!block; 2130 } 2131 2132 /** 2133 * migration_page_queue_free: drop any remaining pages in the ram 2134 * request queue 2135 * 2136 * It should be empty at the end anyway, but in error cases there may 2137 * be some left. in case that there is any page left, we drop it. 2138 * 2139 */ 2140 static void migration_page_queue_free(RAMState *rs) 2141 { 2142 struct RAMSrcPageRequest *mspr, *next_mspr; 2143 /* This queue generally should be empty - but in the case of a failed 2144 * migration might have some droppings in. 2145 */ 2146 RCU_READ_LOCK_GUARD(); 2147 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2148 memory_region_unref(mspr->rb->mr); 2149 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2150 g_free(mspr); 2151 } 2152 } 2153 2154 /** 2155 * ram_save_queue_pages: queue the page for transmission 2156 * 2157 * A request from postcopy destination for example. 2158 * 2159 * Returns zero on success or negative on error 2160 * 2161 * @rbname: Name of the RAMBLock of the request. NULL means the 2162 * same that last one. 2163 * @start: starting address from the start of the RAMBlock 2164 * @len: length (in bytes) to send 2165 */ 2166 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2167 { 2168 RAMBlock *ramblock; 2169 RAMState *rs = ram_state; 2170 2171 stat64_add(&ram_counters.postcopy_requests, 1); 2172 RCU_READ_LOCK_GUARD(); 2173 2174 if (!rbname) { 2175 /* Reuse last RAMBlock */ 2176 ramblock = rs->last_req_rb; 2177 2178 if (!ramblock) { 2179 /* 2180 * Shouldn't happen, we can't reuse the last RAMBlock if 2181 * it's the 1st request. 2182 */ 2183 error_report("ram_save_queue_pages no previous block"); 2184 return -1; 2185 } 2186 } else { 2187 ramblock = qemu_ram_block_by_name(rbname); 2188 2189 if (!ramblock) { 2190 /* We shouldn't be asked for a non-existent RAMBlock */ 2191 error_report("ram_save_queue_pages no block '%s'", rbname); 2192 return -1; 2193 } 2194 rs->last_req_rb = ramblock; 2195 } 2196 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2197 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2198 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2199 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2200 __func__, start, len, ramblock->used_length); 2201 return -1; 2202 } 2203 2204 /* 2205 * When with postcopy preempt, we send back the page directly in the 2206 * rp-return thread. 
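 * Roughly: take bitmap_mutex, point the RAM_CHANNEL_POSTCOPY search state at the requested page, and push whole host pages out through the preempt channel via ram_save_host_page_urgent() until the requested length is covered.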
2207 */ 2208 if (postcopy_preempt_active()) { 2209 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2210 size_t page_size = qemu_ram_pagesize(ramblock); 2211 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2212 int ret = 0; 2213 2214 qemu_mutex_lock(&rs->bitmap_mutex); 2215 2216 pss_init(pss, ramblock, page_start); 2217 /* 2218 * Always use the preempt channel, and make sure it's there. It's 2219 * safe to access without a lock, because while the rp-thread is 2220 * running we should be the only one operating on this qemufile. 2221 */ 2222 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2223 assert(pss->pss_channel); 2224 2225 /* 2226 * The length must be one or a multiple of the host page size. Just 2227 * assert; if something is wrong we're mostly split-brain anyway. 2228 */ 2229 assert(len % page_size == 0); 2230 while (len) { 2231 if (ram_save_host_page_urgent(pss)) { 2232 error_report("%s: ram_save_host_page_urgent() failed: " 2233 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2234 __func__, ramblock->idstr, start); 2235 ret = -1; 2236 break; 2237 } 2238 /* 2239 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2240 * will automatically be moved and point to the next host page 2241 * we're going to send, so no need to update here. 2242 * 2243 * Normally QEMU never sends >1 host page per request, so 2244 * logically the loop should only run once; keep it anyway for 2245 * consistency. 2246 */ 2247 len -= page_size; 2248 }; 2249 qemu_mutex_unlock(&rs->bitmap_mutex); 2250 2251 return ret; 2252 } 2253 2254 struct RAMSrcPageRequest *new_entry = 2255 g_new0(struct RAMSrcPageRequest, 1); 2256 new_entry->rb = ramblock; 2257 new_entry->offset = start; 2258 new_entry->len = len; 2259 2260 memory_region_ref(ramblock->mr); 2261 qemu_mutex_lock(&rs->src_page_req_mutex); 2262 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2263 migration_make_urgent_request(); 2264 qemu_mutex_unlock(&rs->src_page_req_mutex); 2265 2266 return 0; 2267 } 2268 2269 static bool save_page_use_compression(RAMState *rs) 2270 { 2271 if (!migrate_compress()) { 2272 return false; 2273 } 2274 2275 /* 2276 * If xbzrle is enabled (e.g., after first round of migration), stop 2277 * using the data compression. In theory, xbzrle can do better than 2278 * compression. 2279 */ 2280 if (rs->xbzrle_enabled) { 2281 return false; 2282 } 2283 2284 return true; 2285 } 2286 2287 /* 2288 * Try to compress the page before posting it out; return true if the page 2289 * has been properly handled by compression, otherwise it needs other 2290 * paths to handle it. 2291 */ 2292 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2293 RAMBlock *block, ram_addr_t offset) 2294 { 2295 if (!save_page_use_compression(rs)) { 2296 return false; 2297 } 2298 2299 /* 2300 * When starting the process of a new block, the first page of 2301 * the block should be sent out before other pages in the same 2302 * block, and all the pages in the last block should have been sent 2303 * out; keeping this order is important, because the 'cont' flag 2304 * is used to avoid resending the block name. 2305 * 2306 * We post the first page as a normal page as compression will take 2307 * a lot of CPU resources.
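 * (The block name travels with that first, uncompressed page; later pages of the same block rely on the 'cont' flag, which is why the compressed queue is flushed below whenever the block changes.)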
2308 */ 2309 if (block != pss->last_sent_block) { 2310 flush_compressed_data(rs); 2311 return false; 2312 } 2313 2314 if (compress_page_with_multi_thread(block, offset) > 0) { 2315 return true; 2316 } 2317 2318 compression_counters.busy++; 2319 return false; 2320 } 2321 2322 /** 2323 * ram_save_target_page_legacy: save one target page 2324 * 2325 * Returns the number of pages written 2326 * 2327 * @rs: current RAM state 2328 * @pss: data about the page we want to send 2329 */ 2330 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2331 { 2332 RAMBlock *block = pss->block; 2333 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2334 int res; 2335 2336 if (control_save_page(pss, block, offset, &res)) { 2337 return res; 2338 } 2339 2340 if (save_compress_page(rs, pss, block, offset)) { 2341 return 1; 2342 } 2343 2344 res = save_zero_page(pss, pss->pss_channel, block, offset); 2345 if (res > 0) { 2346 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2347 * page would be stale 2348 */ 2349 if (rs->xbzrle_enabled) { 2350 XBZRLE_cache_lock(); 2351 xbzrle_cache_zero_page(rs, block->offset + offset); 2352 XBZRLE_cache_unlock(); 2353 } 2354 return res; 2355 } 2356 2357 /* 2358 * Do not use multifd in postcopy as one whole host page should be 2359 * placed. Meanwhile postcopy requires atomic update of pages, so even 2360 * if host page size == guest page size the dest guest during run may 2361 * still see partially copied pages which is data corruption. 2362 */ 2363 if (migrate_multifd() && !migration_in_postcopy()) { 2364 return ram_save_multifd_page(pss->pss_channel, block, offset); 2365 } 2366 2367 return ram_save_page(rs, pss); 2368 } 2369 2370 /* Should be called before sending a host page */ 2371 static void pss_host_page_prepare(PageSearchStatus *pss) 2372 { 2373 /* How many guest pages are there in one host page? */ 2374 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2375 2376 pss->host_page_sending = true; 2377 if (guest_pfns <= 1) { 2378 /* 2379 * This covers both when guest psize == host psize, or when guest 2380 * has larger psize than the host (guest_pfns==0). 2381 * 2382 * For the latter, we always send one whole guest page per 2383 * iteration of the host page (example: an Alpha VM on x86 host 2384 * will have guest psize 8K while host psize 4K). 2385 */ 2386 pss->host_page_start = pss->page; 2387 pss->host_page_end = pss->page + 1; 2388 } else { 2389 /* 2390 * The host page spans over multiple guest pages, we send them 2391 * within the same host page iteration. 2392 */ 2393 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2394 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2395 } 2396 } 2397 2398 /* 2399 * Whether the page pointed by PSS is within the host page being sent. 2400 * Must be called after a previous pss_host_page_prepare(). 2401 */ 2402 static bool pss_within_range(PageSearchStatus *pss) 2403 { 2404 ram_addr_t ram_addr; 2405 2406 assert(pss->host_page_sending); 2407 2408 /* Over host-page boundary? 
*/ 2409 if (pss->page >= pss->host_page_end) { 2410 return false; 2411 } 2412 2413 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2414 2415 return offset_in_ramblock(pss->block, ram_addr); 2416 } 2417 2418 static void pss_host_page_finish(PageSearchStatus *pss) 2419 { 2420 pss->host_page_sending = false; 2421 /* This is not needed, but just to reset it */ 2422 pss->host_page_start = pss->host_page_end = 0; 2423 } 2424 2425 /* 2426 * Send an urgent host page specified by `pss'. Needs to be called with 2427 * bitmap_mutex held. 2428 * 2429 * Returns 0 if saving the host page succeeded, negative otherwise. 2430 */ 2431 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2432 { 2433 bool page_dirty, sent = false; 2434 RAMState *rs = ram_state; 2435 int ret = 0; 2436 2437 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2438 pss_host_page_prepare(pss); 2439 2440 /* 2441 * If precopy is sending the same page, let it be done in precopy, or 2442 * we could send the same page on two channels and neither of them 2443 * would receive the whole page. 2444 */ 2445 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2446 trace_postcopy_preempt_hit(pss->block->idstr, 2447 pss->page << TARGET_PAGE_BITS); 2448 return 0; 2449 } 2450 2451 do { 2452 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2453 2454 if (page_dirty) { 2455 /* Be strict about the return code; it must be 1 (exactly one page sent) */ 2456 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2457 error_report_once("%s: ram_save_target_page failed", __func__); 2458 ret = -1; 2459 goto out; 2460 } 2461 sent = true; 2462 } 2463 pss_find_next_dirty(pss); 2464 } while (pss_within_range(pss)); 2465 out: 2466 pss_host_page_finish(pss); 2467 /* For urgent requests, flush immediately if sent */ 2468 if (sent) { 2469 qemu_fflush(pss->pss_channel); 2470 } 2471 return ret; 2472 } 2473 2474 /** 2475 * ram_save_host_page: save a whole host page 2476 * 2477 * Starting at *offset, send pages up to the end of the current host 2478 * page. It's valid for the initial offset to point into the middle of 2479 * a host page, in which case the remainder of the host page is sent. 2480 * Only dirty target pages are sent. Note that the host page size may 2481 * be a huge page for this block. 2482 * 2483 * The saving stops at the boundary of the used_length of the block 2484 * if the RAMBlock isn't a multiple of the host page size. 2485 * 2486 * The caller must hold ram_state.bitmap_mutex when calling this 2487 * function. Note that this function can temporarily release the lock, but 2488 * it makes sure the lock is held again before it returns.
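 * For example, on a hugetlbfs-backed block with 2MB host pages and 4KB target pages, a single call can send up to 512 dirty target pages before returning.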
2489 * 2490 * Returns the number of pages written or negative on error 2491 * 2492 * @rs: current RAM state 2493 * @pss: data about the page we want to send 2494 */ 2495 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2496 { 2497 bool page_dirty, preempt_active = postcopy_preempt_active(); 2498 int tmppages, pages = 0; 2499 size_t pagesize_bits = 2500 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2501 unsigned long start_page = pss->page; 2502 int res; 2503 2504 if (ramblock_is_ignored(pss->block)) { 2505 error_report("block %s should not be migrated !", pss->block->idstr); 2506 return 0; 2507 } 2508 2509 /* Update host page boundary information */ 2510 pss_host_page_prepare(pss); 2511 2512 do { 2513 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2514 2515 /* Check the pages is dirty and if it is send it */ 2516 if (page_dirty) { 2517 /* 2518 * Properly yield the lock only in postcopy preempt mode 2519 * because both migration thread and rp-return thread can 2520 * operate on the bitmaps. 2521 */ 2522 if (preempt_active) { 2523 qemu_mutex_unlock(&rs->bitmap_mutex); 2524 } 2525 tmppages = migration_ops->ram_save_target_page(rs, pss); 2526 if (tmppages >= 0) { 2527 pages += tmppages; 2528 /* 2529 * Allow rate limiting to happen in the middle of huge pages if 2530 * something is sent in the current iteration. 2531 */ 2532 if (pagesize_bits > 1 && tmppages > 0) { 2533 migration_rate_limit(); 2534 } 2535 } 2536 if (preempt_active) { 2537 qemu_mutex_lock(&rs->bitmap_mutex); 2538 } 2539 } else { 2540 tmppages = 0; 2541 } 2542 2543 if (tmppages < 0) { 2544 pss_host_page_finish(pss); 2545 return tmppages; 2546 } 2547 2548 pss_find_next_dirty(pss); 2549 } while (pss_within_range(pss)); 2550 2551 pss_host_page_finish(pss); 2552 2553 res = ram_save_release_protection(rs, pss, start_page); 2554 return (res < 0 ? res : pages); 2555 } 2556 2557 /** 2558 * ram_find_and_save_block: finds a dirty page and sends it to f 2559 * 2560 * Called within an RCU critical section. 2561 * 2562 * Returns the number of pages written where zero means no dirty pages, 2563 * or negative on error 2564 * 2565 * @rs: current RAM state 2566 * 2567 * On systems where host-page-size > target-page-size it will send all the 2568 * pages in a host page that are dirty. 2569 */ 2570 static int ram_find_and_save_block(RAMState *rs) 2571 { 2572 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2573 int pages = 0; 2574 2575 /* No dirty page as there is zero RAM */ 2576 if (!rs->ram_bytes_total) { 2577 return pages; 2578 } 2579 2580 /* 2581 * Always keep last_seen_block/last_page valid during this procedure, 2582 * because find_dirty_block() relies on these values (e.g., we compare 2583 * last_seen_block with pss.block to see whether we searched all the 2584 * ramblocks) to detect the completion of migration. Having NULL value 2585 * of last_seen_block can conditionally cause below loop to run forever. 
2586 */ 2587 if (!rs->last_seen_block) { 2588 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2589 rs->last_page = 0; 2590 } 2591 2592 pss_init(pss, rs->last_seen_block, rs->last_page); 2593 2594 while (true){ 2595 if (!get_queued_page(rs, pss)) { 2596 /* priority queue empty, so just search for something dirty */ 2597 int res = find_dirty_block(rs, pss); 2598 if (res != PAGE_DIRTY_FOUND) { 2599 if (res == PAGE_ALL_CLEAN) { 2600 break; 2601 } else if (res == PAGE_TRY_AGAIN) { 2602 continue; 2603 } 2604 } 2605 } 2606 pages = ram_save_host_page(rs, pss); 2607 if (pages) { 2608 break; 2609 } 2610 } 2611 2612 rs->last_seen_block = pss->block; 2613 rs->last_page = pss->page; 2614 2615 return pages; 2616 } 2617 2618 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2619 { 2620 uint64_t pages = size / TARGET_PAGE_SIZE; 2621 2622 if (zero) { 2623 stat64_add(&ram_counters.zero_pages, pages); 2624 } else { 2625 stat64_add(&ram_counters.normal_pages, pages); 2626 ram_transferred_add(size); 2627 qemu_file_credit_transfer(f, size); 2628 } 2629 } 2630 2631 static uint64_t ram_bytes_total_with_ignored(void) 2632 { 2633 RAMBlock *block; 2634 uint64_t total = 0; 2635 2636 RCU_READ_LOCK_GUARD(); 2637 2638 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2639 total += block->used_length; 2640 } 2641 return total; 2642 } 2643 2644 uint64_t ram_bytes_total(void) 2645 { 2646 RAMBlock *block; 2647 uint64_t total = 0; 2648 2649 RCU_READ_LOCK_GUARD(); 2650 2651 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2652 total += block->used_length; 2653 } 2654 return total; 2655 } 2656 2657 static void xbzrle_load_setup(void) 2658 { 2659 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2660 } 2661 2662 static void xbzrle_load_cleanup(void) 2663 { 2664 g_free(XBZRLE.decoded_buf); 2665 XBZRLE.decoded_buf = NULL; 2666 } 2667 2668 static void ram_state_cleanup(RAMState **rsp) 2669 { 2670 if (*rsp) { 2671 migration_page_queue_free(*rsp); 2672 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2673 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2674 g_free(*rsp); 2675 *rsp = NULL; 2676 } 2677 } 2678 2679 static void xbzrle_cleanup(void) 2680 { 2681 XBZRLE_cache_lock(); 2682 if (XBZRLE.cache) { 2683 cache_fini(XBZRLE.cache); 2684 g_free(XBZRLE.encoded_buf); 2685 g_free(XBZRLE.current_buf); 2686 g_free(XBZRLE.zero_target_page); 2687 XBZRLE.cache = NULL; 2688 XBZRLE.encoded_buf = NULL; 2689 XBZRLE.current_buf = NULL; 2690 XBZRLE.zero_target_page = NULL; 2691 } 2692 XBZRLE_cache_unlock(); 2693 } 2694 2695 static void ram_save_cleanup(void *opaque) 2696 { 2697 RAMState **rsp = opaque; 2698 RAMBlock *block; 2699 2700 /* We don't use dirty log with background snapshots */ 2701 if (!migrate_background_snapshot()) { 2702 /* caller have hold iothread lock or is in a bh, so there is 2703 * no writing race against the migration bitmap 2704 */ 2705 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2706 /* 2707 * do not stop dirty log without starting it, since 2708 * memory_global_dirty_log_stop will assert that 2709 * memory_global_dirty_log_start/stop used in pairs 2710 */ 2711 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2712 } 2713 } 2714 2715 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2716 g_free(block->clear_bmap); 2717 block->clear_bmap = NULL; 2718 g_free(block->bmap); 2719 block->bmap = NULL; 2720 } 2721 2722 xbzrle_cleanup(); 2723 compress_threads_save_cleanup(); 2724 ram_state_cleanup(rsp); 2725 g_free(migration_ops); 2726 migration_ops = NULL; 2727 } 2728 2729 static void ram_state_reset(RAMState *rs) 2730 { 2731 int i; 2732 
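/* Forget the last block sent on each channel, so the next page sent on every channel carries its full block id again instead of relying on RAM_SAVE_FLAG_CONTINUE. */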
2733 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2734 rs->pss[i].last_sent_block = NULL; 2735 } 2736 2737 rs->last_seen_block = NULL; 2738 rs->last_page = 0; 2739 rs->last_version = ram_list.version; 2740 rs->xbzrle_enabled = false; 2741 } 2742 2743 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2744 2745 /* **** functions for postcopy ***** */ 2746 2747 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2748 { 2749 struct RAMBlock *block; 2750 2751 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2752 unsigned long *bitmap = block->bmap; 2753 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2754 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2755 2756 while (run_start < range) { 2757 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2758 ram_discard_range(block->idstr, 2759 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2760 ((ram_addr_t)(run_end - run_start)) 2761 << TARGET_PAGE_BITS); 2762 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2763 } 2764 } 2765 } 2766 2767 /** 2768 * postcopy_send_discard_bm_ram: discard a RAMBlock 2769 * 2770 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2771 * 2772 * @ms: current migration state 2773 * @block: RAMBlock to discard 2774 */ 2775 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2776 { 2777 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2778 unsigned long current; 2779 unsigned long *bitmap = block->bmap; 2780 2781 for (current = 0; current < end; ) { 2782 unsigned long one = find_next_bit(bitmap, end, current); 2783 unsigned long zero, discard_length; 2784 2785 if (one >= end) { 2786 break; 2787 } 2788 2789 zero = find_next_zero_bit(bitmap, end, one + 1); 2790 2791 if (zero >= end) { 2792 discard_length = end - one; 2793 } else { 2794 discard_length = zero - one; 2795 } 2796 postcopy_discard_send_range(ms, one, discard_length); 2797 current = one + discard_length; 2798 } 2799 } 2800 2801 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2802 2803 /** 2804 * postcopy_each_ram_send_discard: discard all RAMBlocks 2805 * 2806 * Utility for the outgoing postcopy code. 2807 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2808 * passing it bitmap indexes and name. 2809 * (qemu_ram_foreach_block ends up passing unscaled lengths 2810 * which would mean postcopy code would have to deal with target page) 2811 * 2812 * @ms: current migration state 2813 */ 2814 static void postcopy_each_ram_send_discard(MigrationState *ms) 2815 { 2816 struct RAMBlock *block; 2817 2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2819 postcopy_discard_send_init(ms, block->idstr); 2820 2821 /* 2822 * Deal with TPS != HPS and huge pages. It discard any partially sent 2823 * host-page size chunks, mark any partially dirty host-page size 2824 * chunks as all dirty. In this case the host-page is the host-page 2825 * for the particular RAMBlock, i.e. it might be a huge page. 2826 */ 2827 postcopy_chunk_hostpages_pass(ms, block); 2828 2829 /* 2830 * Postcopy sends chunks of bitmap over the wire, but it 2831 * just needs indexes at this point, avoids it having 2832 * target page specific code. 
2833 */ 2834 postcopy_send_discard_bm_ram(ms, block); 2835 postcopy_discard_send_finish(ms); 2836 } 2837 } 2838 2839 /** 2840 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2841 * 2842 * Helper for postcopy_chunk_hostpages; it's called twice to 2843 * canonicalize the two bitmaps, that are similar, but one is 2844 * inverted. 2845 * 2846 * Postcopy requires that all target pages in a hostpage are dirty or 2847 * clean, not a mix. This function canonicalizes the bitmaps. 2848 * 2849 * @ms: current migration state 2850 * @block: block that contains the page we want to canonicalize 2851 */ 2852 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2853 { 2854 RAMState *rs = ram_state; 2855 unsigned long *bitmap = block->bmap; 2856 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2857 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2858 unsigned long run_start; 2859 2860 if (block->page_size == TARGET_PAGE_SIZE) { 2861 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2862 return; 2863 } 2864 2865 /* Find a dirty page */ 2866 run_start = find_next_bit(bitmap, pages, 0); 2867 2868 while (run_start < pages) { 2869 2870 /* 2871 * If the start of this run of pages is in the middle of a host 2872 * page, then we need to fixup this host page. 2873 */ 2874 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2875 /* Find the end of this run */ 2876 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2877 /* 2878 * If the end isn't at the start of a host page, then the 2879 * run doesn't finish at the end of a host page 2880 * and we need to discard. 2881 */ 2882 } 2883 2884 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2885 unsigned long page; 2886 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2887 host_ratio); 2888 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2889 2890 /* Clean up the bitmap */ 2891 for (page = fixup_start_addr; 2892 page < fixup_start_addr + host_ratio; page++) { 2893 /* 2894 * Remark them as dirty, updating the count for any pages 2895 * that weren't previously dirty. 
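 * For example, with 2MB host pages and 4KB target pages (host_ratio of 512), a run starting halfway through a host page gets all 512 bits covering that host page set dirty again, so the whole host page is resent.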
2896 */ 2897 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2898 } 2899 } 2900 2901 /* Find the next dirty page for the next iteration */ 2902 run_start = find_next_bit(bitmap, pages, run_start); 2903 } 2904 } 2905 2906 /** 2907 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2908 * 2909 * Transmit the set of pages to be discarded after precopy to the target 2910 * these are pages that: 2911 * a) Have been previously transmitted but are now dirty again 2912 * b) Pages that have never been transmitted, this ensures that 2913 * any pages on the destination that have been mapped by background 2914 * tasks get discarded (transparent huge pages is the specific concern) 2915 * Hopefully this is pretty sparse 2916 * 2917 * @ms: current migration state 2918 */ 2919 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2920 { 2921 RAMState *rs = ram_state; 2922 2923 RCU_READ_LOCK_GUARD(); 2924 2925 /* This should be our last sync, the src is now paused */ 2926 migration_bitmap_sync(rs); 2927 2928 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2929 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2930 rs->last_seen_block = NULL; 2931 rs->last_page = 0; 2932 2933 postcopy_each_ram_send_discard(ms); 2934 2935 trace_ram_postcopy_send_discard_bitmap(); 2936 } 2937 2938 /** 2939 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2940 * 2941 * Returns zero on success 2942 * 2943 * @rbname: name of the RAMBlock of the request. NULL means the 2944 * same that last one. 2945 * @start: RAMBlock starting page 2946 * @length: RAMBlock size 2947 */ 2948 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2949 { 2950 trace_ram_discard_range(rbname, start, length); 2951 2952 RCU_READ_LOCK_GUARD(); 2953 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2954 2955 if (!rb) { 2956 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2957 return -1; 2958 } 2959 2960 /* 2961 * On source VM, we don't need to update the received bitmap since 2962 * we don't even have one. 2963 */ 2964 if (rb->receivedmap) { 2965 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2966 length >> qemu_target_page_bits()); 2967 } 2968 2969 return ram_block_discard_range(rb, start, length); 2970 } 2971 2972 /* 2973 * For every allocation, we will try not to crash the VM if the 2974 * allocation failed. 
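 * (Hence the g_try_malloc0()/g_try_malloc() calls below, which return NULL on failure instead of aborting, and the unwinding error paths that follow.)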
2975 */ 2976 static int xbzrle_init(void) 2977 { 2978 Error *local_err = NULL; 2979 2980 if (!migrate_xbzrle()) { 2981 return 0; 2982 } 2983 2984 XBZRLE_cache_lock(); 2985 2986 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2987 if (!XBZRLE.zero_target_page) { 2988 error_report("%s: Error allocating zero page", __func__); 2989 goto err_out; 2990 } 2991 2992 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2993 TARGET_PAGE_SIZE, &local_err); 2994 if (!XBZRLE.cache) { 2995 error_report_err(local_err); 2996 goto free_zero_page; 2997 } 2998 2999 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3000 if (!XBZRLE.encoded_buf) { 3001 error_report("%s: Error allocating encoded_buf", __func__); 3002 goto free_cache; 3003 } 3004 3005 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3006 if (!XBZRLE.current_buf) { 3007 error_report("%s: Error allocating current_buf", __func__); 3008 goto free_encoded_buf; 3009 } 3010 3011 /* We are all good */ 3012 XBZRLE_cache_unlock(); 3013 return 0; 3014 3015 free_encoded_buf: 3016 g_free(XBZRLE.encoded_buf); 3017 XBZRLE.encoded_buf = NULL; 3018 free_cache: 3019 cache_fini(XBZRLE.cache); 3020 XBZRLE.cache = NULL; 3021 free_zero_page: 3022 g_free(XBZRLE.zero_target_page); 3023 XBZRLE.zero_target_page = NULL; 3024 err_out: 3025 XBZRLE_cache_unlock(); 3026 return -ENOMEM; 3027 } 3028 3029 static int ram_state_init(RAMState **rsp) 3030 { 3031 *rsp = g_try_new0(RAMState, 1); 3032 3033 if (!*rsp) { 3034 error_report("%s: Init ramstate fail", __func__); 3035 return -1; 3036 } 3037 3038 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3039 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3040 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3041 (*rsp)->ram_bytes_total = ram_bytes_total(); 3042 3043 /* 3044 * Count the total number of pages used by ram blocks not including any 3045 * gaps due to alignment or unplugs. 3046 * This must match with the initial values of dirty bitmap. 3047 */ 3048 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3049 ram_state_reset(*rsp); 3050 3051 return 0; 3052 } 3053 3054 static void ram_list_init_bitmaps(void) 3055 { 3056 MigrationState *ms = migrate_get_current(); 3057 RAMBlock *block; 3058 unsigned long pages; 3059 uint8_t shift; 3060 3061 /* Skip setting bitmap if there is no RAM */ 3062 if (ram_bytes_total()) { 3063 shift = ms->clear_bitmap_shift; 3064 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3065 error_report("clear_bitmap_shift (%u) too big, using " 3066 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3067 shift = CLEAR_BITMAP_SHIFT_MAX; 3068 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3069 error_report("clear_bitmap_shift (%u) too small, using " 3070 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3071 shift = CLEAR_BITMAP_SHIFT_MIN; 3072 } 3073 3074 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3075 pages = block->max_length >> TARGET_PAGE_BITS; 3076 /* 3077 * The initial dirty bitmap for migration must be set with all 3078 * ones to make sure we'll migrate every guest RAM page to 3079 * destination. 3080 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3081 * new migration after a failed migration, ram_list. 3082 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3083 * guest memory. 
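 * The clear_bmap allocated below is much coarser: one bit covers 1 << clear_bmap_shift target pages, so assuming 4KB target pages and the usual default shift of 18, a single bit stands for 1GB of guest memory.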
3084 */ 3085 block->bmap = bitmap_new(pages); 3086 bitmap_set(block->bmap, 0, pages); 3087 block->clear_bmap_shift = shift; 3088 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3089 } 3090 } 3091 } 3092 3093 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3094 { 3095 unsigned long pages; 3096 RAMBlock *rb; 3097 3098 RCU_READ_LOCK_GUARD(); 3099 3100 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3101 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3102 rs->migration_dirty_pages -= pages; 3103 } 3104 } 3105 3106 static void ram_init_bitmaps(RAMState *rs) 3107 { 3108 /* For memory_global_dirty_log_start below. */ 3109 qemu_mutex_lock_iothread(); 3110 qemu_mutex_lock_ramlist(); 3111 3112 WITH_RCU_READ_LOCK_GUARD() { 3113 ram_list_init_bitmaps(); 3114 /* We don't use dirty log with background snapshots */ 3115 if (!migrate_background_snapshot()) { 3116 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3117 migration_bitmap_sync_precopy(rs); 3118 } 3119 } 3120 qemu_mutex_unlock_ramlist(); 3121 qemu_mutex_unlock_iothread(); 3122 3123 /* 3124 * After an eventual first bitmap sync, fixup the initial bitmap 3125 * containing all 1s to exclude any discarded pages from migration. 3126 */ 3127 migration_bitmap_clear_discarded_pages(rs); 3128 } 3129 3130 static int ram_init_all(RAMState **rsp) 3131 { 3132 if (ram_state_init(rsp)) { 3133 return -1; 3134 } 3135 3136 if (xbzrle_init()) { 3137 ram_state_cleanup(rsp); 3138 return -1; 3139 } 3140 3141 ram_init_bitmaps(*rsp); 3142 3143 return 0; 3144 } 3145 3146 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3147 { 3148 RAMBlock *block; 3149 uint64_t pages = 0; 3150 3151 /* 3152 * Postcopy is not using xbzrle/compression, so no need for that. 3153 * Also, since source are already halted, we don't need to care 3154 * about dirty page logging as well. 3155 */ 3156 3157 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3158 pages += bitmap_count_one(block->bmap, 3159 block->used_length >> TARGET_PAGE_BITS); 3160 } 3161 3162 /* This may not be aligned with current bitmaps. Recalculate. */ 3163 rs->migration_dirty_pages = pages; 3164 3165 ram_state_reset(rs); 3166 3167 /* Update RAMState cache of output QEMUFile */ 3168 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3169 3170 trace_ram_state_resume_prepare(pages); 3171 } 3172 3173 /* 3174 * This function clears bits of the free pages reported by the caller from the 3175 * migration dirty bitmap. @addr is the host address corresponding to the 3176 * start of the continuous guest free pages, and @len is the total bytes of 3177 * those pages. 3178 */ 3179 void qemu_guest_free_page_hint(void *addr, size_t len) 3180 { 3181 RAMBlock *block; 3182 ram_addr_t offset; 3183 size_t used_len, start, npages; 3184 MigrationState *s = migrate_get_current(); 3185 3186 /* This function is currently expected to be used during live migration */ 3187 if (!migration_is_setup_or_active(s->state)) { 3188 return; 3189 } 3190 3191 for (; len > 0; len -= used_len, addr += used_len) { 3192 block = qemu_ram_block_from_host(addr, false, &offset); 3193 if (unlikely(!block || offset >= block->used_length)) { 3194 /* 3195 * The implementation might not support RAMBlock resize during 3196 * live migration, but it could happen in theory with future 3197 * updates. So we add a check here to capture that case. 
3198 */ 3199 error_report_once("%s unexpected error", __func__); 3200 return; 3201 } 3202 3203 if (len <= block->used_length - offset) { 3204 used_len = len; 3205 } else { 3206 used_len = block->used_length - offset; 3207 } 3208 3209 start = offset >> TARGET_PAGE_BITS; 3210 npages = used_len >> TARGET_PAGE_BITS; 3211 3212 qemu_mutex_lock(&ram_state->bitmap_mutex); 3213 /* 3214 * The skipped free pages are equavalent to be sent from clear_bmap's 3215 * perspective, so clear the bits from the memory region bitmap which 3216 * are initially set. Otherwise those skipped pages will be sent in 3217 * the next round after syncing from the memory region bitmap. 3218 */ 3219 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3220 ram_state->migration_dirty_pages -= 3221 bitmap_count_one_with_offset(block->bmap, start, npages); 3222 bitmap_clear(block->bmap, start, npages); 3223 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3224 } 3225 } 3226 3227 /* 3228 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3229 * long-running RCU critical section. When rcu-reclaims in the code 3230 * start to become numerous it will be necessary to reduce the 3231 * granularity of these critical sections. 3232 */ 3233 3234 /** 3235 * ram_save_setup: Setup RAM for migration 3236 * 3237 * Returns zero to indicate success and negative for error 3238 * 3239 * @f: QEMUFile where to send the data 3240 * @opaque: RAMState pointer 3241 */ 3242 static int ram_save_setup(QEMUFile *f, void *opaque) 3243 { 3244 RAMState **rsp = opaque; 3245 RAMBlock *block; 3246 int ret; 3247 3248 if (compress_threads_save_setup()) { 3249 return -1; 3250 } 3251 3252 /* migration has already setup the bitmap, reuse it. */ 3253 if (!migration_in_colo_state()) { 3254 if (ram_init_all(rsp) != 0) { 3255 compress_threads_save_cleanup(); 3256 return -1; 3257 } 3258 } 3259 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3260 3261 WITH_RCU_READ_LOCK_GUARD() { 3262 qemu_put_be64(f, ram_bytes_total_with_ignored() 3263 | RAM_SAVE_FLAG_MEM_SIZE); 3264 3265 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3266 qemu_put_byte(f, strlen(block->idstr)); 3267 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3268 qemu_put_be64(f, block->used_length); 3269 if (migrate_postcopy_ram() && block->page_size != 3270 qemu_host_page_size) { 3271 qemu_put_be64(f, block->page_size); 3272 } 3273 if (migrate_ignore_shared()) { 3274 qemu_put_be64(f, block->mr->addr); 3275 } 3276 } 3277 } 3278 3279 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3280 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3281 3282 migration_ops = g_malloc0(sizeof(MigrationOps)); 3283 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3284 ret = multifd_send_sync_main(f); 3285 if (ret < 0) { 3286 return ret; 3287 } 3288 3289 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3290 qemu_fflush(f); 3291 3292 return 0; 3293 } 3294 3295 /** 3296 * ram_save_iterate: iterative stage for migration 3297 * 3298 * Returns zero to indicate success and negative for error 3299 * 3300 * @f: QEMUFile where to send the data 3301 * @opaque: RAMState pointer 3302 */ 3303 static int ram_save_iterate(QEMUFile *f, void *opaque) 3304 { 3305 RAMState **temp = opaque; 3306 RAMState *rs = *temp; 3307 int ret = 0; 3308 int i; 3309 int64_t t0; 3310 int done = 0; 3311 3312 if (blk_mig_bulk_active()) { 3313 /* Avoid transferring ram during bulk phase of block migration as 3314 * the bulk phase will usually take a long time and transferring 3315 * ram updates during that time is 
pointless. */ 3316 goto out; 3317 } 3318 3319 /* 3320 * We'll hold this lock for a while, but it's okay for two reasons. 3321 * Firstly, the only other thread that could take it is the one calling 3322 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3323 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3324 * guarantees that we release it on a regular basis. 3325 */ 3326 qemu_mutex_lock(&rs->bitmap_mutex); 3327 WITH_RCU_READ_LOCK_GUARD() { 3328 if (ram_list.version != rs->last_version) { 3329 ram_state_reset(rs); 3330 } 3331 3332 /* Read version before ram_list.blocks */ 3333 smp_rmb(); 3334 3335 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3336 3337 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3338 i = 0; 3339 while ((ret = qemu_file_rate_limit(f)) == 0 || 3340 postcopy_has_request(rs)) { 3341 int pages; 3342 3343 if (qemu_file_get_error(f)) { 3344 break; 3345 } 3346 3347 pages = ram_find_and_save_block(rs); 3348 /* no more pages to send */ 3349 if (pages == 0) { 3350 done = 1; 3351 break; 3352 } 3353 3354 if (pages < 0) { 3355 qemu_file_set_error(f, pages); 3356 break; 3357 } 3358 3359 rs->target_page_count += pages; 3360 3361 /* 3362 * During postcopy, it is necessary to make sure one whole host 3363 * page is sent in one chunk. 3364 */ 3365 if (migrate_postcopy_ram()) { 3366 flush_compressed_data(rs); 3367 } 3368 3369 /* 3370 * We want to check in the first loop iteration, just in case it was 3371 * the first time and we had to sync the dirty bitmap. 3372 * qemu_clock_get_ns() is a bit expensive, so we only check every 3373 * few iterations. 3374 */ 3375 if ((i & 63) == 0) { 3376 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3377 1000000; 3378 if (t1 > MAX_WAIT) { 3379 trace_ram_save_iterate_big_wait(t1, i); 3380 break; 3381 } 3382 } 3383 i++; 3384 } 3385 } 3386 qemu_mutex_unlock(&rs->bitmap_mutex); 3387 3388 /* 3389 * Must occur before EOS (or any QEMUFile operation) 3390 * because of RDMA protocol.
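 * (The EOS marker written afterwards is a single 8-byte flag word, which is what the ram_transferred_add(8) accounting below refers to.)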
3391 */ 3392 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3393 3394 out: 3395 if (ret >= 0 3396 && migration_is_setup_or_active(migrate_get_current()->state)) { 3397 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3398 if (ret < 0) { 3399 return ret; 3400 } 3401 3402 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3403 qemu_fflush(f); 3404 ram_transferred_add(8); 3405 3406 ret = qemu_file_get_error(f); 3407 } 3408 if (ret < 0) { 3409 return ret; 3410 } 3411 3412 return done; 3413 } 3414 3415 /** 3416 * ram_save_complete: function called to send the remaining amount of ram 3417 * 3418 * Returns zero to indicate success or negative on error 3419 * 3420 * Called with iothread lock 3421 * 3422 * @f: QEMUFile where to send the data 3423 * @opaque: RAMState pointer 3424 */ 3425 static int ram_save_complete(QEMUFile *f, void *opaque) 3426 { 3427 RAMState **temp = opaque; 3428 RAMState *rs = *temp; 3429 int ret = 0; 3430 3431 rs->last_stage = !migration_in_colo_state(); 3432 3433 WITH_RCU_READ_LOCK_GUARD() { 3434 if (!migration_in_postcopy()) { 3435 migration_bitmap_sync_precopy(rs); 3436 } 3437 3438 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3439 3440 /* try transferring iterative blocks of memory */ 3441 3442 /* flush all remaining blocks regardless of rate limiting */ 3443 qemu_mutex_lock(&rs->bitmap_mutex); 3444 while (true) { 3445 int pages; 3446 3447 pages = ram_find_and_save_block(rs); 3448 /* no more blocks to sent */ 3449 if (pages == 0) { 3450 break; 3451 } 3452 if (pages < 0) { 3453 ret = pages; 3454 break; 3455 } 3456 } 3457 qemu_mutex_unlock(&rs->bitmap_mutex); 3458 3459 flush_compressed_data(rs); 3460 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3461 } 3462 3463 if (ret < 0) { 3464 return ret; 3465 } 3466 3467 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3468 if (ret < 0) { 3469 return ret; 3470 } 3471 3472 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3473 qemu_fflush(f); 3474 3475 return 0; 3476 } 3477 3478 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3479 uint64_t *can_postcopy) 3480 { 3481 RAMState **temp = opaque; 3482 RAMState *rs = *temp; 3483 3484 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3485 3486 if (migrate_postcopy_ram()) { 3487 /* We can do postcopy, and all the data is postcopiable */ 3488 *can_postcopy += remaining_size; 3489 } else { 3490 *must_precopy += remaining_size; 3491 } 3492 } 3493 3494 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3495 uint64_t *can_postcopy) 3496 { 3497 MigrationState *s = migrate_get_current(); 3498 RAMState **temp = opaque; 3499 RAMState *rs = *temp; 3500 3501 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3502 3503 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3504 qemu_mutex_lock_iothread(); 3505 WITH_RCU_READ_LOCK_GUARD() { 3506 migration_bitmap_sync_precopy(rs); 3507 } 3508 qemu_mutex_unlock_iothread(); 3509 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3510 } 3511 3512 if (migrate_postcopy_ram()) { 3513 /* We can do postcopy, and all the data is postcopiable */ 3514 *can_postcopy += remaining_size; 3515 } else { 3516 *must_precopy += remaining_size; 3517 } 3518 } 3519 3520 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3521 { 3522 unsigned int xh_len; 3523 int xh_flags; 3524 uint8_t *loaded_data; 3525 3526 /* extract RLE header */ 3527 xh_flags = qemu_get_byte(f); 3528 xh_len = qemu_get_be16(f); 3529 3530 if 
(xh_flags != ENCODING_FLAG_XBZRLE) { 3531 error_report("Failed to load XBZRLE page - wrong compression!"); 3532 return -1; 3533 } 3534 3535 if (xh_len > TARGET_PAGE_SIZE) { 3536 error_report("Failed to load XBZRLE page - len overflow!"); 3537 return -1; 3538 } 3539 loaded_data = XBZRLE.decoded_buf; 3540 /* load data and decode */ 3541 /* it can change loaded_data to point to an internal buffer */ 3542 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3543 3544 /* decode RLE */ 3545 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3546 TARGET_PAGE_SIZE) == -1) { 3547 error_report("Failed to load XBZRLE page - decode error!"); 3548 return -1; 3549 } 3550 3551 return 0; 3552 } 3553 3554 /** 3555 * ram_block_from_stream: read a RAMBlock id from the migration stream 3556 * 3557 * Must be called from within a rcu critical section. 3558 * 3559 * Returns a pointer from within the RCU-protected ram_list. 3560 * 3561 * @mis: the migration incoming state pointer 3562 * @f: QEMUFile where to read the data from 3563 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3564 * @channel: the channel we're using 3565 */ 3566 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3567 QEMUFile *f, int flags, 3568 int channel) 3569 { 3570 RAMBlock *block = mis->last_recv_block[channel]; 3571 char id[256]; 3572 uint8_t len; 3573 3574 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3575 if (!block) { 3576 error_report("Ack, bad migration stream!"); 3577 return NULL; 3578 } 3579 return block; 3580 } 3581 3582 len = qemu_get_byte(f); 3583 qemu_get_buffer(f, (uint8_t *)id, len); 3584 id[len] = 0; 3585 3586 block = qemu_ram_block_by_name(id); 3587 if (!block) { 3588 error_report("Can't find block %s", id); 3589 return NULL; 3590 } 3591 3592 if (ramblock_is_ignored(block)) { 3593 error_report("block %s should not be migrated !", id); 3594 return NULL; 3595 } 3596 3597 mis->last_recv_block[channel] = block; 3598 3599 return block; 3600 } 3601 3602 static inline void *host_from_ram_block_offset(RAMBlock *block, 3603 ram_addr_t offset) 3604 { 3605 if (!offset_in_ramblock(block, offset)) { 3606 return NULL; 3607 } 3608 3609 return block->host + offset; 3610 } 3611 3612 static void *host_page_from_ram_block_offset(RAMBlock *block, 3613 ram_addr_t offset) 3614 { 3615 /* Note: Explicitly no check against offset_in_ramblock(). */ 3616 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3617 block->page_size); 3618 } 3619 3620 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3621 ram_addr_t offset) 3622 { 3623 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3624 } 3625 3626 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3627 ram_addr_t offset, bool record_bitmap) 3628 { 3629 if (!offset_in_ramblock(block, offset)) { 3630 return NULL; 3631 } 3632 if (!block->colo_cache) { 3633 error_report("%s: colo_cache is NULL in block :%s", 3634 __func__, block->idstr); 3635 return NULL; 3636 } 3637 3638 /* 3639 * During colo checkpoint, we need bitmap of these migrated pages. 3640 * It help us to decide which pages in ram cache should be flushed 3641 * into VM's RAM later. 
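 * Setting a bit here also bumps migration_dirty_pages, so the later flush of the ram cache knows how much work is still pending.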
3642 */ 3643 if (record_bitmap && 3644 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3645 ram_state->migration_dirty_pages++; 3646 } 3647 return block->colo_cache + offset; 3648 } 3649 3650 /** 3651 * ram_handle_compressed: handle the zero page case 3652 * 3653 * If a page (or a whole RDMA chunk) has been 3654 * determined to be zero, then zap it. 3655 * 3656 * @host: host address for the zero page 3657 * @ch: what the page is filled with. We only support zero 3658 * @size: size of the zero page 3659 */ 3660 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3661 { 3662 if (ch != 0 || !buffer_is_zero(host, size)) { 3663 memset(host, ch, size); 3664 } 3665 } 3666 3667 /* return the size after decompression, or negative value on error */ 3668 static int 3669 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3670 const uint8_t *source, size_t source_len) 3671 { 3672 int err; 3673 3674 err = inflateReset(stream); 3675 if (err != Z_OK) { 3676 return -1; 3677 } 3678 3679 stream->avail_in = source_len; 3680 stream->next_in = (uint8_t *)source; 3681 stream->avail_out = dest_len; 3682 stream->next_out = dest; 3683 3684 err = inflate(stream, Z_NO_FLUSH); 3685 if (err != Z_STREAM_END) { 3686 return -1; 3687 } 3688 3689 return stream->total_out; 3690 } 3691 3692 static void *do_data_decompress(void *opaque) 3693 { 3694 DecompressParam *param = opaque; 3695 unsigned long pagesize; 3696 uint8_t *des; 3697 int len, ret; 3698 3699 qemu_mutex_lock(&param->mutex); 3700 while (!param->quit) { 3701 if (param->des) { 3702 des = param->des; 3703 len = param->len; 3704 param->des = 0; 3705 qemu_mutex_unlock(&param->mutex); 3706 3707 pagesize = TARGET_PAGE_SIZE; 3708 3709 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3710 param->compbuf, len); 3711 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3712 error_report("decompress data failed"); 3713 qemu_file_set_error(decomp_file, ret); 3714 } 3715 3716 qemu_mutex_lock(&decomp_done_lock); 3717 param->done = true; 3718 qemu_cond_signal(&decomp_done_cond); 3719 qemu_mutex_unlock(&decomp_done_lock); 3720 3721 qemu_mutex_lock(&param->mutex); 3722 } else { 3723 qemu_cond_wait(&param->cond, &param->mutex); 3724 } 3725 } 3726 qemu_mutex_unlock(&param->mutex); 3727 3728 return NULL; 3729 } 3730 3731 static int wait_for_decompress_done(void) 3732 { 3733 int idx, thread_count; 3734 3735 if (!migrate_compress()) { 3736 return 0; 3737 } 3738 3739 thread_count = migrate_decompress_threads(); 3740 qemu_mutex_lock(&decomp_done_lock); 3741 for (idx = 0; idx < thread_count; idx++) { 3742 while (!decomp_param[idx].done) { 3743 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3744 } 3745 } 3746 qemu_mutex_unlock(&decomp_done_lock); 3747 return qemu_file_get_error(decomp_file); 3748 } 3749 3750 static void compress_threads_load_cleanup(void) 3751 { 3752 int i, thread_count; 3753 3754 if (!migrate_compress()) { 3755 return; 3756 } 3757 thread_count = migrate_decompress_threads(); 3758 for (i = 0; i < thread_count; i++) { 3759 /* 3760 * we use it as an indicator of whether the thread is 3761 * properly init'd or not 3762 */ 3763 if (!decomp_param[i].compbuf) { 3764 break; 3765 } 3766 3767 qemu_mutex_lock(&decomp_param[i].mutex); 3768 decomp_param[i].quit = true; 3769 qemu_cond_signal(&decomp_param[i].cond); 3770 qemu_mutex_unlock(&decomp_param[i].mutex); 3771 } 3772 for (i = 0; i < thread_count; i++) { 3773 if (!decomp_param[i].compbuf) { 3774 break; 3775 } 3776 3777 qemu_thread_join(decompress_threads + i); 3778
qemu_mutex_destroy(&decomp_param[i].mutex); 3779 qemu_cond_destroy(&decomp_param[i].cond); 3780 inflateEnd(&decomp_param[i].stream); 3781 g_free(decomp_param[i].compbuf); 3782 decomp_param[i].compbuf = NULL; 3783 } 3784 g_free(decompress_threads); 3785 g_free(decomp_param); 3786 decompress_threads = NULL; 3787 decomp_param = NULL; 3788 decomp_file = NULL; 3789 } 3790 3791 static int compress_threads_load_setup(QEMUFile *f) 3792 { 3793 int i, thread_count; 3794 3795 if (!migrate_compress()) { 3796 return 0; 3797 } 3798 3799 thread_count = migrate_decompress_threads(); 3800 decompress_threads = g_new0(QemuThread, thread_count); 3801 decomp_param = g_new0(DecompressParam, thread_count); 3802 qemu_mutex_init(&decomp_done_lock); 3803 qemu_cond_init(&decomp_done_cond); 3804 decomp_file = f; 3805 for (i = 0; i < thread_count; i++) { 3806 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3807 goto exit; 3808 } 3809 3810 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3811 qemu_mutex_init(&decomp_param[i].mutex); 3812 qemu_cond_init(&decomp_param[i].cond); 3813 decomp_param[i].done = true; 3814 decomp_param[i].quit = false; 3815 qemu_thread_create(decompress_threads + i, "decompress", 3816 do_data_decompress, decomp_param + i, 3817 QEMU_THREAD_JOINABLE); 3818 } 3819 return 0; 3820 exit: 3821 compress_threads_load_cleanup(); 3822 return -1; 3823 } 3824 3825 static void decompress_data_with_multi_threads(QEMUFile *f, 3826 void *host, int len) 3827 { 3828 int idx, thread_count; 3829 3830 thread_count = migrate_decompress_threads(); 3831 QEMU_LOCK_GUARD(&decomp_done_lock); 3832 while (true) { 3833 for (idx = 0; idx < thread_count; idx++) { 3834 if (decomp_param[idx].done) { 3835 decomp_param[idx].done = false; 3836 qemu_mutex_lock(&decomp_param[idx].mutex); 3837 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3838 decomp_param[idx].des = host; 3839 decomp_param[idx].len = len; 3840 qemu_cond_signal(&decomp_param[idx].cond); 3841 qemu_mutex_unlock(&decomp_param[idx].mutex); 3842 break; 3843 } 3844 } 3845 if (idx < thread_count) { 3846 break; 3847 } else { 3848 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3849 } 3850 } 3851 } 3852 3853 static void colo_init_ram_state(void) 3854 { 3855 ram_state_init(&ram_state); 3856 } 3857 3858 /* 3859 * colo cache: this is for secondary VM, we cache the whole 3860 * memory of the secondary VM, it is need to hold the global lock 3861 * to call this helper. 3862 */ 3863 int colo_init_ram_cache(void) 3864 { 3865 RAMBlock *block; 3866 3867 WITH_RCU_READ_LOCK_GUARD() { 3868 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3869 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3870 NULL, false, false); 3871 if (!block->colo_cache) { 3872 error_report("%s: Can't alloc memory for COLO cache of block %s," 3873 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3874 block->used_length); 3875 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3876 if (block->colo_cache) { 3877 qemu_anon_ram_free(block->colo_cache, block->used_length); 3878 block->colo_cache = NULL; 3879 } 3880 } 3881 return -errno; 3882 } 3883 if (!machine_dump_guest_core(current_machine)) { 3884 qemu_madvise(block->colo_cache, block->used_length, 3885 QEMU_MADV_DONTDUMP); 3886 } 3887 } 3888 } 3889 3890 /* 3891 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3892 * with to decide which page in cache should be flushed into SVM's RAM. Here 3893 * we use the same name 'ram_bitmap' as for migration. 
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}

static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}

/*
 * colo cache: this is for the secondary VM, we cache the whole
 * memory of the secondary VM.  The global lock must be held when
 * calling this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of "
                             "block %s, size 0x" RAM_ADDR_FMT, __func__,
                             block->idstr, block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that are sent by the PVM; this dirty bitmap is
     * used to decide which pages in the cache should be flushed into the
     * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock must be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}
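
/*
 * Tear down the incoming-RAM state set up by ram_load_setup(): write back
 * any file-backed RAM blocks, release the XBZRLE and decompression
 * resources, and free the per-block received-page bitmaps.
 */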
static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses (possibly smaller) target pages;
             * however, the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip setting page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
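             * In that case the whole host page is a single zero target
             * page, so it will be placed with postcopy_place_page_zero()
             * and the temporary buffer is never read.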
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Check for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush pages that have been dirtied by the PVM, the SVM, or both.
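 *
 * The dirty bitmap is first re-synced from the dirty log; then
 * colo_bitmap_find_dirty() walks each RAMBlock and every run of dirty
 * pages is copied from colo_cache back into the block's host memory.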
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE comes earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_compress()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many
         * iterations
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage, we should not load pages into
             * the SVM's memory directly; we put them into colo_cache first.
             * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
             * Previously, we copied all of this memory in the COLO
             * preparation stage while the VM had to be stopped, which is a
             * time-consuming process.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled; this slows the
             * migration down somewhat, but it clearly reduces the downtime
             * of backing up all of the SVM's memory in the COLO
             * preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * Put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
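        /*
         * RAM_SAVE_FLAG_EOS marks the end of this section of the stream;
         * the multifd receive channels are also synced here.
         */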
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it to form the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
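 *
 * The stream carries, per ramblock: a big-endian 64-bit size, the bitmap
 * itself in little-endian long order (padded up to a multiple of 8 bytes),
 * and a 64-bit end mark (RAMBLOCK_RECV_BITMAP_ENDING) used as a sanity
 * check.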
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused).
     * The dirty bitmap won't change.  We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If
     * this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}