/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
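
/*
 * Illustrative note: every page sent on the wire is preceded by a 64-bit
 * value that combines the page offset within its RAMBlock with the
 * RAM_SAVE_FLAG_* bits defined below; the offset is target-page aligned,
 * so the low bits are free to carry flags.  A zero page at offset 0x3000,
 * for example, is announced roughly as (0x3000 | RAM_SAVE_FLAG_ZERO).
 */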
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
/* We can't use any flag that is bigger than 0x200 */

int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
                                 uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        /* We must check that AVX is not just available, but usable. */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when the source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

RAMStats ram_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&ram_counters.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_counters.postcopy_bytes, bytes);
    } else {
        stat64_add(&ram_counters.downtime_bytes, bytes);
    }
    stat64_add(&ram_counters.transferred, bytes);
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
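
/*
 * Rough sketch of the handshake used by the compression code below: the
 * migration thread hands a (block, offset) pair to an idle worker by
 * filling its CompressParam and signalling param->cond; the worker
 * compresses the page into its private QEMUFile buffer, marks
 * param->done and signals comp_done_cond so the migration thread can
 * collect and transfer the result.
 */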
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used together with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
        (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
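            /*
             * Worked example (illustrative numbers): with throttle_now at
             * 20%, 100MB dirtied in the period but only 50MB transferred,
             * the guest should ideally get 80 * 50 / 100 = 40% CPU, so the
             * increment is MIN(80 - 40, pct_increment).
             */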
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&ram_counters.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're in
 * the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If during sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long.  We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
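    /*
     * Illustrative sizing (assuming 4KiB target pages and a
     * clear_bmap_shift of 18): each chunk handled below covers
     * 1 << (12 + 18) bytes, i.e. 1GiB of guest memory per
     * memory_region_clear_dirty_bitmap() call.
     */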
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the
 * contiguous dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&ram_counters.normal_pages) +
        stat64_get(&ram_counters.zero_pages) +
        compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;
    uint64_t bytes_xfer_period =
        stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */
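        /*
         * Illustrative check (assuming a 50% threshold): if 200MB were
         * transferred in the period, throttling starts or increases once
         * more than 100MB were dirtied in the same period and that has
         * happened twice.
         */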

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&ram_counters.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
    }
    if (migrate_use_events()) {
        uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}
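
/*
 * Illustrative wire cost of a zero page as produced above: an 8-byte
 * header (page offset | RAM_SAVE_FLAG_ZERO, plus the block id string
 * for the first page of a block) followed by a single zero byte, so
 * roughly 9 bytes per zero page on the stream.
 */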

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&ram_counters.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_counters.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_counters.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_counters.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_counters.normal_pages, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, f, block, offset)) {
        return true;
    }

    save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM,
     * so that we can catch errors during compression and decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_counters.zero_pages, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
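         * The bmap dirty bit is the single source of truth here: if the
         * background scan already cleared it, the request is dropped below
         * instead of the page being sent twice.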
2090 */ 2091 if (block) { 2092 unsigned long page; 2093 2094 page = offset >> TARGET_PAGE_BITS; 2095 dirty = test_bit(page, block->bmap); 2096 if (!dirty) { 2097 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2098 page); 2099 } else { 2100 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2101 } 2102 } 2103 2104 } while (block && !dirty); 2105 2106 if (!block) { 2107 /* 2108 * Poll write faults too if background snapshot is enabled; that's 2109 * when we have vcpus got blocked by the write protected pages. 2110 */ 2111 block = poll_fault_page(rs, &offset); 2112 } 2113 2114 if (block) { 2115 /* 2116 * We want the background search to continue from the queued page 2117 * since the guest is likely to want other pages near to the page 2118 * it just requested. 2119 */ 2120 pss->block = block; 2121 pss->page = offset >> TARGET_PAGE_BITS; 2122 2123 /* 2124 * This unqueued page would break the "one round" check, even is 2125 * really rare. 2126 */ 2127 pss->complete_round = false; 2128 } 2129 2130 return !!block; 2131 } 2132 2133 /** 2134 * migration_page_queue_free: drop any remaining pages in the ram 2135 * request queue 2136 * 2137 * It should be empty at the end anyway, but in error cases there may 2138 * be some left. in case that there is any page left, we drop it. 2139 * 2140 */ 2141 static void migration_page_queue_free(RAMState *rs) 2142 { 2143 struct RAMSrcPageRequest *mspr, *next_mspr; 2144 /* This queue generally should be empty - but in the case of a failed 2145 * migration might have some droppings in. 2146 */ 2147 RCU_READ_LOCK_GUARD(); 2148 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2149 memory_region_unref(mspr->rb->mr); 2150 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2151 g_free(mspr); 2152 } 2153 } 2154 2155 /** 2156 * ram_save_queue_pages: queue the page for transmission 2157 * 2158 * A request from postcopy destination for example. 2159 * 2160 * Returns zero on success or negative on error 2161 * 2162 * @rbname: Name of the RAMBLock of the request. NULL means the 2163 * same that last one. 2164 * @start: starting address from the start of the RAMBlock 2165 * @len: length (in bytes) to send 2166 */ 2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2168 { 2169 RAMBlock *ramblock; 2170 RAMState *rs = ram_state; 2171 2172 stat64_add(&ram_counters.postcopy_requests, 1); 2173 RCU_READ_LOCK_GUARD(); 2174 2175 if (!rbname) { 2176 /* Reuse last RAMBlock */ 2177 ramblock = rs->last_req_rb; 2178 2179 if (!ramblock) { 2180 /* 2181 * Shouldn't happen, we can't reuse the last RAMBlock if 2182 * it's the 1st request. 2183 */ 2184 error_report("ram_save_queue_pages no previous block"); 2185 return -1; 2186 } 2187 } else { 2188 ramblock = qemu_ram_block_by_name(rbname); 2189 2190 if (!ramblock) { 2191 /* We shouldn't be asked for a non-existent RAMBlock */ 2192 error_report("ram_save_queue_pages no block '%s'", rbname); 2193 return -1; 2194 } 2195 rs->last_req_rb = ramblock; 2196 } 2197 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2198 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2199 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2200 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2201 __func__, start, len, ramblock->used_length); 2202 return -1; 2203 } 2204 2205 /* 2206 * When with postcopy preempt, we send back the page directly in the 2207 * rp-return thread. 
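     * Without preempt, the request is instead queued on src_page_requests
     * further down and picked up by the migration thread via
     * get_queued_page()/unqueue_page().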
2208 */ 2209 if (postcopy_preempt_active()) { 2210 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2211 size_t page_size = qemu_ram_pagesize(ramblock); 2212 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2213 int ret = 0; 2214 2215 qemu_mutex_lock(&rs->bitmap_mutex); 2216 2217 pss_init(pss, ramblock, page_start); 2218 /* 2219 * Always use the preempt channel, and make sure it's there. It's 2220 * safe to access without lock, because when rp-thread is running 2221 * we should be the only one who operates on the qemufile 2222 */ 2223 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2224 assert(pss->pss_channel); 2225 2226 /* 2227 * It must be either one or multiple of host page size. Just 2228 * assert; if something wrong we're mostly split brain anyway. 2229 */ 2230 assert(len % page_size == 0); 2231 while (len) { 2232 if (ram_save_host_page_urgent(pss)) { 2233 error_report("%s: ram_save_host_page_urgent() failed: " 2234 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2235 __func__, ramblock->idstr, start); 2236 ret = -1; 2237 break; 2238 } 2239 /* 2240 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2241 * will automatically be moved and point to the next host page 2242 * we're going to send, so no need to update here. 2243 * 2244 * Normally QEMU never sends >1 host page in requests, so 2245 * logically we don't even need that as the loop should only 2246 * run once, but just to be consistent. 2247 */ 2248 len -= page_size; 2249 }; 2250 qemu_mutex_unlock(&rs->bitmap_mutex); 2251 2252 return ret; 2253 } 2254 2255 struct RAMSrcPageRequest *new_entry = 2256 g_new0(struct RAMSrcPageRequest, 1); 2257 new_entry->rb = ramblock; 2258 new_entry->offset = start; 2259 new_entry->len = len; 2260 2261 memory_region_ref(ramblock->mr); 2262 qemu_mutex_lock(&rs->src_page_req_mutex); 2263 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2264 migration_make_urgent_request(); 2265 qemu_mutex_unlock(&rs->src_page_req_mutex); 2266 2267 return 0; 2268 } 2269 2270 static bool save_page_use_compression(RAMState *rs) 2271 { 2272 if (!migrate_use_compression()) { 2273 return false; 2274 } 2275 2276 /* 2277 * If xbzrle is enabled (e.g., after first round of migration), stop 2278 * using the data compression. In theory, xbzrle can do better than 2279 * compression. 2280 */ 2281 if (rs->xbzrle_enabled) { 2282 return false; 2283 } 2284 2285 return true; 2286 } 2287 2288 /* 2289 * try to compress the page before posting it out, return true if the page 2290 * has been properly handled by compression, otherwise needs other 2291 * paths to handle it 2292 */ 2293 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2294 RAMBlock *block, ram_addr_t offset) 2295 { 2296 if (!save_page_use_compression(rs)) { 2297 return false; 2298 } 2299 2300 /* 2301 * When starting the process of a new block, the first page of 2302 * the block should be sent out before other pages in the same 2303 * block, and all the pages in last block should have been sent 2304 * out, keeping this order is important, because the 'cont' flag 2305 * is used to avoid resending the block name. 2306 * 2307 * We post the fist page as normal page as compression will take 2308 * much CPU resource. 
2309 */ 2310 if (block != pss->last_sent_block) { 2311 flush_compressed_data(rs); 2312 return false; 2313 } 2314 2315 if (compress_page_with_multi_thread(block, offset) > 0) { 2316 return true; 2317 } 2318 2319 compression_counters.busy++; 2320 return false; 2321 } 2322 2323 /** 2324 * ram_save_target_page_legacy: save one target page 2325 * 2326 * Returns the number of pages written 2327 * 2328 * @rs: current RAM state 2329 * @pss: data about the page we want to send 2330 */ 2331 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2332 { 2333 RAMBlock *block = pss->block; 2334 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2335 int res; 2336 2337 if (control_save_page(pss, block, offset, &res)) { 2338 return res; 2339 } 2340 2341 if (save_compress_page(rs, pss, block, offset)) { 2342 return 1; 2343 } 2344 2345 res = save_zero_page(pss, pss->pss_channel, block, offset); 2346 if (res > 0) { 2347 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2348 * page would be stale 2349 */ 2350 if (rs->xbzrle_enabled) { 2351 XBZRLE_cache_lock(); 2352 xbzrle_cache_zero_page(rs, block->offset + offset); 2353 XBZRLE_cache_unlock(); 2354 } 2355 return res; 2356 } 2357 2358 /* 2359 * Do not use multifd in postcopy as one whole host page should be 2360 * placed. Meanwhile postcopy requires atomic update of pages, so even 2361 * if host page size == guest page size the dest guest during run may 2362 * still see partially copied pages which is data corruption. 2363 */ 2364 if (migrate_use_multifd() && !migration_in_postcopy()) { 2365 return ram_save_multifd_page(pss->pss_channel, block, offset); 2366 } 2367 2368 return ram_save_page(rs, pss); 2369 } 2370 2371 /* Should be called before sending a host page */ 2372 static void pss_host_page_prepare(PageSearchStatus *pss) 2373 { 2374 /* How many guest pages are there in one host page? */ 2375 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2376 2377 pss->host_page_sending = true; 2378 if (guest_pfns <= 1) { 2379 /* 2380 * This covers both when guest psize == host psize, or when guest 2381 * has larger psize than the host (guest_pfns==0). 2382 * 2383 * For the latter, we always send one whole guest page per 2384 * iteration of the host page (example: an Alpha VM on x86 host 2385 * will have guest psize 8K while host psize 4K). 2386 */ 2387 pss->host_page_start = pss->page; 2388 pss->host_page_end = pss->page + 1; 2389 } else { 2390 /* 2391 * The host page spans over multiple guest pages, we send them 2392 * within the same host page iteration. 2393 */ 2394 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2395 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2396 } 2397 } 2398 2399 /* 2400 * Whether the page pointed by PSS is within the host page being sent. 2401 * Must be called after a previous pss_host_page_prepare(). 2402 */ 2403 static bool pss_within_range(PageSearchStatus *pss) 2404 { 2405 ram_addr_t ram_addr; 2406 2407 assert(pss->host_page_sending); 2408 2409 /* Over host-page boundary? 
*/ 2410 if (pss->page >= pss->host_page_end) { 2411 return false; 2412 } 2413 2414 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2415 2416 return offset_in_ramblock(pss->block, ram_addr); 2417 } 2418 2419 static void pss_host_page_finish(PageSearchStatus *pss) 2420 { 2421 pss->host_page_sending = false; 2422 /* This is not needed, but just to reset it */ 2423 pss->host_page_start = pss->host_page_end = 0; 2424 } 2425 2426 /* 2427 * Send an urgent host page specified by `pss'. Need to be called with 2428 * bitmap_mutex held. 2429 * 2430 * Returns 0 if save host page succeeded, false otherwise. 2431 */ 2432 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2433 { 2434 bool page_dirty, sent = false; 2435 RAMState *rs = ram_state; 2436 int ret = 0; 2437 2438 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2439 pss_host_page_prepare(pss); 2440 2441 /* 2442 * If precopy is sending the same page, let it be done in precopy, or 2443 * we could send the same page in two channels and none of them will 2444 * receive the whole page. 2445 */ 2446 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2447 trace_postcopy_preempt_hit(pss->block->idstr, 2448 pss->page << TARGET_PAGE_BITS); 2449 return 0; 2450 } 2451 2452 do { 2453 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2454 2455 if (page_dirty) { 2456 /* Be strict to return code; it must be 1, or what else? */ 2457 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2458 error_report_once("%s: ram_save_target_page failed", __func__); 2459 ret = -1; 2460 goto out; 2461 } 2462 sent = true; 2463 } 2464 pss_find_next_dirty(pss); 2465 } while (pss_within_range(pss)); 2466 out: 2467 pss_host_page_finish(pss); 2468 /* For urgent requests, flush immediately if sent */ 2469 if (sent) { 2470 qemu_fflush(pss->pss_channel); 2471 } 2472 return ret; 2473 } 2474 2475 /** 2476 * ram_save_host_page: save a whole host page 2477 * 2478 * Starting at *offset send pages up to the end of the current host 2479 * page. It's valid for the initial offset to point into the middle of 2480 * a host page in which case the remainder of the hostpage is sent. 2481 * Only dirty target pages are sent. Note that the host page size may 2482 * be a huge page for this block. 2483 * 2484 * The saving stops at the boundary of the used_length of the block 2485 * if the RAMBlock isn't a multiple of the host page size. 2486 * 2487 * The caller must be with ram_state.bitmap_mutex held to call this 2488 * function. Note that this function can temporarily release the lock, but 2489 * when the function is returned it'll make sure the lock is still held. 
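 *
 * The lock is only dropped around ram_save_target_page() when postcopy
 * preempt is active, because only then can the return-path thread touch
 * the bitmaps concurrently.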
2490 * 2491 * Returns the number of pages written or negative on error 2492 * 2493 * @rs: current RAM state 2494 * @pss: data about the page we want to send 2495 */ 2496 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2497 { 2498 bool page_dirty, preempt_active = postcopy_preempt_active(); 2499 int tmppages, pages = 0; 2500 size_t pagesize_bits = 2501 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2502 unsigned long start_page = pss->page; 2503 int res; 2504 2505 if (ramblock_is_ignored(pss->block)) { 2506 error_report("block %s should not be migrated !", pss->block->idstr); 2507 return 0; 2508 } 2509 2510 /* Update host page boundary information */ 2511 pss_host_page_prepare(pss); 2512 2513 do { 2514 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2515 2516 /* Check the pages is dirty and if it is send it */ 2517 if (page_dirty) { 2518 /* 2519 * Properly yield the lock only in postcopy preempt mode 2520 * because both migration thread and rp-return thread can 2521 * operate on the bitmaps. 2522 */ 2523 if (preempt_active) { 2524 qemu_mutex_unlock(&rs->bitmap_mutex); 2525 } 2526 tmppages = migration_ops->ram_save_target_page(rs, pss); 2527 if (tmppages >= 0) { 2528 pages += tmppages; 2529 /* 2530 * Allow rate limiting to happen in the middle of huge pages if 2531 * something is sent in the current iteration. 2532 */ 2533 if (pagesize_bits > 1 && tmppages > 0) { 2534 migration_rate_limit(); 2535 } 2536 } 2537 if (preempt_active) { 2538 qemu_mutex_lock(&rs->bitmap_mutex); 2539 } 2540 } else { 2541 tmppages = 0; 2542 } 2543 2544 if (tmppages < 0) { 2545 pss_host_page_finish(pss); 2546 return tmppages; 2547 } 2548 2549 pss_find_next_dirty(pss); 2550 } while (pss_within_range(pss)); 2551 2552 pss_host_page_finish(pss); 2553 2554 res = ram_save_release_protection(rs, pss, start_page); 2555 return (res < 0 ? res : pages); 2556 } 2557 2558 /** 2559 * ram_find_and_save_block: finds a dirty page and sends it to f 2560 * 2561 * Called within an RCU critical section. 2562 * 2563 * Returns the number of pages written where zero means no dirty pages, 2564 * or negative on error 2565 * 2566 * @rs: current RAM state 2567 * 2568 * On systems where host-page-size > target-page-size it will send all the 2569 * pages in a host page that are dirty. 2570 */ 2571 static int ram_find_and_save_block(RAMState *rs) 2572 { 2573 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2574 int pages = 0; 2575 2576 /* No dirty page as there is zero RAM */ 2577 if (!rs->ram_bytes_total) { 2578 return pages; 2579 } 2580 2581 /* 2582 * Always keep last_seen_block/last_page valid during this procedure, 2583 * because find_dirty_block() relies on these values (e.g., we compare 2584 * last_seen_block with pss.block to see whether we searched all the 2585 * ramblocks) to detect the completion of migration. Having NULL value 2586 * of last_seen_block can conditionally cause below loop to run forever. 
2587 */ 2588 if (!rs->last_seen_block) { 2589 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2590 rs->last_page = 0; 2591 } 2592 2593 pss_init(pss, rs->last_seen_block, rs->last_page); 2594 2595 while (true){ 2596 if (!get_queued_page(rs, pss)) { 2597 /* priority queue empty, so just search for something dirty */ 2598 int res = find_dirty_block(rs, pss); 2599 if (res != PAGE_DIRTY_FOUND) { 2600 if (res == PAGE_ALL_CLEAN) { 2601 break; 2602 } else if (res == PAGE_TRY_AGAIN) { 2603 continue; 2604 } 2605 } 2606 } 2607 pages = ram_save_host_page(rs, pss); 2608 if (pages) { 2609 break; 2610 } 2611 } 2612 2613 rs->last_seen_block = pss->block; 2614 rs->last_page = pss->page; 2615 2616 return pages; 2617 } 2618 2619 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2620 { 2621 uint64_t pages = size / TARGET_PAGE_SIZE; 2622 2623 if (zero) { 2624 stat64_add(&ram_counters.zero_pages, pages); 2625 } else { 2626 stat64_add(&ram_counters.normal_pages, pages); 2627 ram_transferred_add(size); 2628 qemu_file_credit_transfer(f, size); 2629 } 2630 } 2631 2632 static uint64_t ram_bytes_total_with_ignored(void) 2633 { 2634 RAMBlock *block; 2635 uint64_t total = 0; 2636 2637 RCU_READ_LOCK_GUARD(); 2638 2639 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2640 total += block->used_length; 2641 } 2642 return total; 2643 } 2644 2645 uint64_t ram_bytes_total(void) 2646 { 2647 RAMBlock *block; 2648 uint64_t total = 0; 2649 2650 RCU_READ_LOCK_GUARD(); 2651 2652 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2653 total += block->used_length; 2654 } 2655 return total; 2656 } 2657 2658 static void xbzrle_load_setup(void) 2659 { 2660 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2661 } 2662 2663 static void xbzrle_load_cleanup(void) 2664 { 2665 g_free(XBZRLE.decoded_buf); 2666 XBZRLE.decoded_buf = NULL; 2667 } 2668 2669 static void ram_state_cleanup(RAMState **rsp) 2670 { 2671 if (*rsp) { 2672 migration_page_queue_free(*rsp); 2673 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2674 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2675 g_free(*rsp); 2676 *rsp = NULL; 2677 } 2678 } 2679 2680 static void xbzrle_cleanup(void) 2681 { 2682 XBZRLE_cache_lock(); 2683 if (XBZRLE.cache) { 2684 cache_fini(XBZRLE.cache); 2685 g_free(XBZRLE.encoded_buf); 2686 g_free(XBZRLE.current_buf); 2687 g_free(XBZRLE.zero_target_page); 2688 XBZRLE.cache = NULL; 2689 XBZRLE.encoded_buf = NULL; 2690 XBZRLE.current_buf = NULL; 2691 XBZRLE.zero_target_page = NULL; 2692 } 2693 XBZRLE_cache_unlock(); 2694 } 2695 2696 static void ram_save_cleanup(void *opaque) 2697 { 2698 RAMState **rsp = opaque; 2699 RAMBlock *block; 2700 2701 /* We don't use dirty log with background snapshots */ 2702 if (!migrate_background_snapshot()) { 2703 /* caller have hold iothread lock or is in a bh, so there is 2704 * no writing race against the migration bitmap 2705 */ 2706 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2707 /* 2708 * do not stop dirty log without starting it, since 2709 * memory_global_dirty_log_stop will assert that 2710 * memory_global_dirty_log_start/stop used in pairs 2711 */ 2712 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2713 } 2714 } 2715 2716 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2717 g_free(block->clear_bmap); 2718 block->clear_bmap = NULL; 2719 g_free(block->bmap); 2720 block->bmap = NULL; 2721 } 2722 2723 xbzrle_cleanup(); 2724 compress_threads_save_cleanup(); 2725 ram_state_cleanup(rsp); 2726 g_free(migration_ops); 2727 migration_ops = NULL; 2728 } 2729 2730 static void ram_state_reset(RAMState *rs) 2731 { 2732 int i; 2733 
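    /*
     * Forget the last block sent on each channel, so the next page on a
     * channel goes out with a full block header instead of relying on the
     * RAM_SAVE_FLAG_CONTINUE shortcut.
     */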
2734 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2735 rs->pss[i].last_sent_block = NULL; 2736 } 2737 2738 rs->last_seen_block = NULL; 2739 rs->last_page = 0; 2740 rs->last_version = ram_list.version; 2741 rs->xbzrle_enabled = false; 2742 } 2743 2744 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2745 2746 /* **** functions for postcopy ***** */ 2747 2748 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2749 { 2750 struct RAMBlock *block; 2751 2752 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2753 unsigned long *bitmap = block->bmap; 2754 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2755 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2756 2757 while (run_start < range) { 2758 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2759 ram_discard_range(block->idstr, 2760 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2761 ((ram_addr_t)(run_end - run_start)) 2762 << TARGET_PAGE_BITS); 2763 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2764 } 2765 } 2766 } 2767 2768 /** 2769 * postcopy_send_discard_bm_ram: discard a RAMBlock 2770 * 2771 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2772 * 2773 * @ms: current migration state 2774 * @block: RAMBlock to discard 2775 */ 2776 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2777 { 2778 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2779 unsigned long current; 2780 unsigned long *bitmap = block->bmap; 2781 2782 for (current = 0; current < end; ) { 2783 unsigned long one = find_next_bit(bitmap, end, current); 2784 unsigned long zero, discard_length; 2785 2786 if (one >= end) { 2787 break; 2788 } 2789 2790 zero = find_next_zero_bit(bitmap, end, one + 1); 2791 2792 if (zero >= end) { 2793 discard_length = end - one; 2794 } else { 2795 discard_length = zero - one; 2796 } 2797 postcopy_discard_send_range(ms, one, discard_length); 2798 current = one + discard_length; 2799 } 2800 } 2801 2802 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2803 2804 /** 2805 * postcopy_each_ram_send_discard: discard all RAMBlocks 2806 * 2807 * Utility for the outgoing postcopy code. 2808 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2809 * passing it bitmap indexes and name. 2810 * (qemu_ram_foreach_block ends up passing unscaled lengths 2811 * which would mean postcopy code would have to deal with target page) 2812 * 2813 * @ms: current migration state 2814 */ 2815 static void postcopy_each_ram_send_discard(MigrationState *ms) 2816 { 2817 struct RAMBlock *block; 2818 2819 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2820 postcopy_discard_send_init(ms, block->idstr); 2821 2822 /* 2823 * Deal with TPS != HPS and huge pages. It discard any partially sent 2824 * host-page size chunks, mark any partially dirty host-page size 2825 * chunks as all dirty. In this case the host-page is the host-page 2826 * for the particular RAMBlock, i.e. it might be a huge page. 2827 */ 2828 postcopy_chunk_hostpages_pass(ms, block); 2829 2830 /* 2831 * Postcopy sends chunks of bitmap over the wire, but it 2832 * just needs indexes at this point, avoids it having 2833 * target page specific code. 
2834 */ 2835 postcopy_send_discard_bm_ram(ms, block); 2836 postcopy_discard_send_finish(ms); 2837 } 2838 } 2839 2840 /** 2841 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2842 * 2843 * Helper for postcopy_chunk_hostpages; it's called twice to 2844 * canonicalize the two bitmaps, that are similar, but one is 2845 * inverted. 2846 * 2847 * Postcopy requires that all target pages in a hostpage are dirty or 2848 * clean, not a mix. This function canonicalizes the bitmaps. 2849 * 2850 * @ms: current migration state 2851 * @block: block that contains the page we want to canonicalize 2852 */ 2853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2854 { 2855 RAMState *rs = ram_state; 2856 unsigned long *bitmap = block->bmap; 2857 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2858 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2859 unsigned long run_start; 2860 2861 if (block->page_size == TARGET_PAGE_SIZE) { 2862 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2863 return; 2864 } 2865 2866 /* Find a dirty page */ 2867 run_start = find_next_bit(bitmap, pages, 0); 2868 2869 while (run_start < pages) { 2870 2871 /* 2872 * If the start of this run of pages is in the middle of a host 2873 * page, then we need to fixup this host page. 2874 */ 2875 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2876 /* Find the end of this run */ 2877 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2878 /* 2879 * If the end isn't at the start of a host page, then the 2880 * run doesn't finish at the end of a host page 2881 * and we need to discard. 2882 */ 2883 } 2884 2885 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2886 unsigned long page; 2887 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2888 host_ratio); 2889 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2890 2891 /* Clean up the bitmap */ 2892 for (page = fixup_start_addr; 2893 page < fixup_start_addr + host_ratio; page++) { 2894 /* 2895 * Remark them as dirty, updating the count for any pages 2896 * that weren't previously dirty. 
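                 * test_and_set_bit() returns the previous bit value, so only
                 * pages that actually flip from clean to dirty bump the
                 * counter.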
2897 */ 2898 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2899 } 2900 } 2901 2902 /* Find the next dirty page for the next iteration */ 2903 run_start = find_next_bit(bitmap, pages, run_start); 2904 } 2905 } 2906 2907 /** 2908 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2909 * 2910 * Transmit the set of pages to be discarded after precopy to the target 2911 * these are pages that: 2912 * a) Have been previously transmitted but are now dirty again 2913 * b) Pages that have never been transmitted, this ensures that 2914 * any pages on the destination that have been mapped by background 2915 * tasks get discarded (transparent huge pages is the specific concern) 2916 * Hopefully this is pretty sparse 2917 * 2918 * @ms: current migration state 2919 */ 2920 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2921 { 2922 RAMState *rs = ram_state; 2923 2924 RCU_READ_LOCK_GUARD(); 2925 2926 /* This should be our last sync, the src is now paused */ 2927 migration_bitmap_sync(rs); 2928 2929 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2930 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2931 rs->last_seen_block = NULL; 2932 rs->last_page = 0; 2933 2934 postcopy_each_ram_send_discard(ms); 2935 2936 trace_ram_postcopy_send_discard_bitmap(); 2937 } 2938 2939 /** 2940 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2941 * 2942 * Returns zero on success 2943 * 2944 * @rbname: name of the RAMBlock of the request. NULL means the 2945 * same that last one. 2946 * @start: RAMBlock starting page 2947 * @length: RAMBlock size 2948 */ 2949 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2950 { 2951 trace_ram_discard_range(rbname, start, length); 2952 2953 RCU_READ_LOCK_GUARD(); 2954 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2955 2956 if (!rb) { 2957 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2958 return -1; 2959 } 2960 2961 /* 2962 * On source VM, we don't need to update the received bitmap since 2963 * we don't even have one. 2964 */ 2965 if (rb->receivedmap) { 2966 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2967 length >> qemu_target_page_bits()); 2968 } 2969 2970 return ram_block_discard_range(rb, start, length); 2971 } 2972 2973 /* 2974 * For every allocation, we will try not to crash the VM if the 2975 * allocation failed. 
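 * That's why the g_try_* allocators are used below: they return NULL on
 * failure instead of aborting, and the error labels unwind whatever was
 * already allocated.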
2976 */ 2977 static int xbzrle_init(void) 2978 { 2979 Error *local_err = NULL; 2980 2981 if (!migrate_use_xbzrle()) { 2982 return 0; 2983 } 2984 2985 XBZRLE_cache_lock(); 2986 2987 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2988 if (!XBZRLE.zero_target_page) { 2989 error_report("%s: Error allocating zero page", __func__); 2990 goto err_out; 2991 } 2992 2993 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2994 TARGET_PAGE_SIZE, &local_err); 2995 if (!XBZRLE.cache) { 2996 error_report_err(local_err); 2997 goto free_zero_page; 2998 } 2999 3000 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3001 if (!XBZRLE.encoded_buf) { 3002 error_report("%s: Error allocating encoded_buf", __func__); 3003 goto free_cache; 3004 } 3005 3006 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3007 if (!XBZRLE.current_buf) { 3008 error_report("%s: Error allocating current_buf", __func__); 3009 goto free_encoded_buf; 3010 } 3011 3012 /* We are all good */ 3013 XBZRLE_cache_unlock(); 3014 return 0; 3015 3016 free_encoded_buf: 3017 g_free(XBZRLE.encoded_buf); 3018 XBZRLE.encoded_buf = NULL; 3019 free_cache: 3020 cache_fini(XBZRLE.cache); 3021 XBZRLE.cache = NULL; 3022 free_zero_page: 3023 g_free(XBZRLE.zero_target_page); 3024 XBZRLE.zero_target_page = NULL; 3025 err_out: 3026 XBZRLE_cache_unlock(); 3027 return -ENOMEM; 3028 } 3029 3030 static int ram_state_init(RAMState **rsp) 3031 { 3032 *rsp = g_try_new0(RAMState, 1); 3033 3034 if (!*rsp) { 3035 error_report("%s: Init ramstate fail", __func__); 3036 return -1; 3037 } 3038 3039 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3040 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3041 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3042 (*rsp)->ram_bytes_total = ram_bytes_total(); 3043 3044 /* 3045 * Count the total number of pages used by ram blocks not including any 3046 * gaps due to alignment or unplugs. 3047 * This must match with the initial values of dirty bitmap. 3048 */ 3049 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3050 ram_state_reset(*rsp); 3051 3052 return 0; 3053 } 3054 3055 static void ram_list_init_bitmaps(void) 3056 { 3057 MigrationState *ms = migrate_get_current(); 3058 RAMBlock *block; 3059 unsigned long pages; 3060 uint8_t shift; 3061 3062 /* Skip setting bitmap if there is no RAM */ 3063 if (ram_bytes_total()) { 3064 shift = ms->clear_bitmap_shift; 3065 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3066 error_report("clear_bitmap_shift (%u) too big, using " 3067 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3068 shift = CLEAR_BITMAP_SHIFT_MAX; 3069 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3070 error_report("clear_bitmap_shift (%u) too small, using " 3071 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3072 shift = CLEAR_BITMAP_SHIFT_MIN; 3073 } 3074 3075 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3076 pages = block->max_length >> TARGET_PAGE_BITS; 3077 /* 3078 * The initial dirty bitmap for migration must be set with all 3079 * ones to make sure we'll migrate every guest RAM page to 3080 * destination. 3081 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3082 * new migration after a failed migration, ram_list. 3083 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3084 * guest memory. 
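             * Pages that are actually discarded get cleared from this
             * all-ones bitmap again right after the first sync, see
             * migration_bitmap_clear_discarded_pages().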
3085 */ 3086 block->bmap = bitmap_new(pages); 3087 bitmap_set(block->bmap, 0, pages); 3088 block->clear_bmap_shift = shift; 3089 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3090 } 3091 } 3092 } 3093 3094 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3095 { 3096 unsigned long pages; 3097 RAMBlock *rb; 3098 3099 RCU_READ_LOCK_GUARD(); 3100 3101 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3102 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3103 rs->migration_dirty_pages -= pages; 3104 } 3105 } 3106 3107 static void ram_init_bitmaps(RAMState *rs) 3108 { 3109 /* For memory_global_dirty_log_start below. */ 3110 qemu_mutex_lock_iothread(); 3111 qemu_mutex_lock_ramlist(); 3112 3113 WITH_RCU_READ_LOCK_GUARD() { 3114 ram_list_init_bitmaps(); 3115 /* We don't use dirty log with background snapshots */ 3116 if (!migrate_background_snapshot()) { 3117 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3118 migration_bitmap_sync_precopy(rs); 3119 } 3120 } 3121 qemu_mutex_unlock_ramlist(); 3122 qemu_mutex_unlock_iothread(); 3123 3124 /* 3125 * After an eventual first bitmap sync, fixup the initial bitmap 3126 * containing all 1s to exclude any discarded pages from migration. 3127 */ 3128 migration_bitmap_clear_discarded_pages(rs); 3129 } 3130 3131 static int ram_init_all(RAMState **rsp) 3132 { 3133 if (ram_state_init(rsp)) { 3134 return -1; 3135 } 3136 3137 if (xbzrle_init()) { 3138 ram_state_cleanup(rsp); 3139 return -1; 3140 } 3141 3142 ram_init_bitmaps(*rsp); 3143 3144 return 0; 3145 } 3146 3147 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3148 { 3149 RAMBlock *block; 3150 uint64_t pages = 0; 3151 3152 /* 3153 * Postcopy is not using xbzrle/compression, so no need for that. 3154 * Also, since source are already halted, we don't need to care 3155 * about dirty page logging as well. 3156 */ 3157 3158 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3159 pages += bitmap_count_one(block->bmap, 3160 block->used_length >> TARGET_PAGE_BITS); 3161 } 3162 3163 /* This may not be aligned with current bitmaps. Recalculate. */ 3164 rs->migration_dirty_pages = pages; 3165 3166 ram_state_reset(rs); 3167 3168 /* Update RAMState cache of output QEMUFile */ 3169 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3170 3171 trace_ram_state_resume_prepare(pages); 3172 } 3173 3174 /* 3175 * This function clears bits of the free pages reported by the caller from the 3176 * migration dirty bitmap. @addr is the host address corresponding to the 3177 * start of the continuous guest free pages, and @len is the total bytes of 3178 * those pages. 3179 */ 3180 void qemu_guest_free_page_hint(void *addr, size_t len) 3181 { 3182 RAMBlock *block; 3183 ram_addr_t offset; 3184 size_t used_len, start, npages; 3185 MigrationState *s = migrate_get_current(); 3186 3187 /* This function is currently expected to be used during live migration */ 3188 if (!migration_is_setup_or_active(s->state)) { 3189 return; 3190 } 3191 3192 for (; len > 0; len -= used_len, addr += used_len) { 3193 block = qemu_ram_block_from_host(addr, false, &offset); 3194 if (unlikely(!block || offset >= block->used_length)) { 3195 /* 3196 * The implementation might not support RAMBlock resize during 3197 * live migration, but it could happen in theory with future 3198 * updates. So we add a check here to capture that case. 
3199 */ 3200 error_report_once("%s unexpected error", __func__); 3201 return; 3202 } 3203 3204 if (len <= block->used_length - offset) { 3205 used_len = len; 3206 } else { 3207 used_len = block->used_length - offset; 3208 } 3209 3210 start = offset >> TARGET_PAGE_BITS; 3211 npages = used_len >> TARGET_PAGE_BITS; 3212 3213 qemu_mutex_lock(&ram_state->bitmap_mutex); 3214 /* 3215 * The skipped free pages are equavalent to be sent from clear_bmap's 3216 * perspective, so clear the bits from the memory region bitmap which 3217 * are initially set. Otherwise those skipped pages will be sent in 3218 * the next round after syncing from the memory region bitmap. 3219 */ 3220 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3221 ram_state->migration_dirty_pages -= 3222 bitmap_count_one_with_offset(block->bmap, start, npages); 3223 bitmap_clear(block->bmap, start, npages); 3224 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3225 } 3226 } 3227 3228 /* 3229 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3230 * long-running RCU critical section. When rcu-reclaims in the code 3231 * start to become numerous it will be necessary to reduce the 3232 * granularity of these critical sections. 3233 */ 3234 3235 /** 3236 * ram_save_setup: Setup RAM for migration 3237 * 3238 * Returns zero to indicate success and negative for error 3239 * 3240 * @f: QEMUFile where to send the data 3241 * @opaque: RAMState pointer 3242 */ 3243 static int ram_save_setup(QEMUFile *f, void *opaque) 3244 { 3245 RAMState **rsp = opaque; 3246 RAMBlock *block; 3247 int ret; 3248 3249 if (compress_threads_save_setup()) { 3250 return -1; 3251 } 3252 3253 /* migration has already setup the bitmap, reuse it. */ 3254 if (!migration_in_colo_state()) { 3255 if (ram_init_all(rsp) != 0) { 3256 compress_threads_save_cleanup(); 3257 return -1; 3258 } 3259 } 3260 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3261 3262 WITH_RCU_READ_LOCK_GUARD() { 3263 qemu_put_be64(f, ram_bytes_total_with_ignored() 3264 | RAM_SAVE_FLAG_MEM_SIZE); 3265 3266 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3267 qemu_put_byte(f, strlen(block->idstr)); 3268 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3269 qemu_put_be64(f, block->used_length); 3270 if (migrate_postcopy_ram() && block->page_size != 3271 qemu_host_page_size) { 3272 qemu_put_be64(f, block->page_size); 3273 } 3274 if (migrate_ignore_shared()) { 3275 qemu_put_be64(f, block->mr->addr); 3276 } 3277 } 3278 } 3279 3280 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3281 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3282 3283 migration_ops = g_malloc0(sizeof(MigrationOps)); 3284 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3285 ret = multifd_send_sync_main(f); 3286 if (ret < 0) { 3287 return ret; 3288 } 3289 3290 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3291 qemu_fflush(f); 3292 3293 return 0; 3294 } 3295 3296 /** 3297 * ram_save_iterate: iterative stage for migration 3298 * 3299 * Returns zero to indicate success and negative for error 3300 * 3301 * @f: QEMUFile where to send the data 3302 * @opaque: RAMState pointer 3303 */ 3304 static int ram_save_iterate(QEMUFile *f, void *opaque) 3305 { 3306 RAMState **temp = opaque; 3307 RAMState *rs = *temp; 3308 int ret = 0; 3309 int i; 3310 int64_t t0; 3311 int done = 0; 3312 3313 if (blk_mig_bulk_active()) { 3314 /* Avoid transferring ram during bulk phase of block migration as 3315 * the bulk phase will usually take a long time and transferring 3316 * ram updates during that time is 
pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only other thread that may take it is the one calling
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * few iterations.
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                    1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
3392 */ 3393 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3394 3395 out: 3396 if (ret >= 0 3397 && migration_is_setup_or_active(migrate_get_current()->state)) { 3398 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3399 if (ret < 0) { 3400 return ret; 3401 } 3402 3403 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3404 qemu_fflush(f); 3405 ram_transferred_add(8); 3406 3407 ret = qemu_file_get_error(f); 3408 } 3409 if (ret < 0) { 3410 return ret; 3411 } 3412 3413 return done; 3414 } 3415 3416 /** 3417 * ram_save_complete: function called to send the remaining amount of ram 3418 * 3419 * Returns zero to indicate success or negative on error 3420 * 3421 * Called with iothread lock 3422 * 3423 * @f: QEMUFile where to send the data 3424 * @opaque: RAMState pointer 3425 */ 3426 static int ram_save_complete(QEMUFile *f, void *opaque) 3427 { 3428 RAMState **temp = opaque; 3429 RAMState *rs = *temp; 3430 int ret = 0; 3431 3432 rs->last_stage = !migration_in_colo_state(); 3433 3434 WITH_RCU_READ_LOCK_GUARD() { 3435 if (!migration_in_postcopy()) { 3436 migration_bitmap_sync_precopy(rs); 3437 } 3438 3439 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3440 3441 /* try transferring iterative blocks of memory */ 3442 3443 /* flush all remaining blocks regardless of rate limiting */ 3444 qemu_mutex_lock(&rs->bitmap_mutex); 3445 while (true) { 3446 int pages; 3447 3448 pages = ram_find_and_save_block(rs); 3449 /* no more blocks to sent */ 3450 if (pages == 0) { 3451 break; 3452 } 3453 if (pages < 0) { 3454 ret = pages; 3455 break; 3456 } 3457 } 3458 qemu_mutex_unlock(&rs->bitmap_mutex); 3459 3460 flush_compressed_data(rs); 3461 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3462 } 3463 3464 if (ret < 0) { 3465 return ret; 3466 } 3467 3468 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3469 if (ret < 0) { 3470 return ret; 3471 } 3472 3473 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3474 qemu_fflush(f); 3475 3476 return 0; 3477 } 3478 3479 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3480 uint64_t *can_postcopy) 3481 { 3482 RAMState **temp = opaque; 3483 RAMState *rs = *temp; 3484 3485 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3486 3487 if (migrate_postcopy_ram()) { 3488 /* We can do postcopy, and all the data is postcopiable */ 3489 *can_postcopy += remaining_size; 3490 } else { 3491 *must_precopy += remaining_size; 3492 } 3493 } 3494 3495 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3496 uint64_t *can_postcopy) 3497 { 3498 MigrationState *s = migrate_get_current(); 3499 RAMState **temp = opaque; 3500 RAMState *rs = *temp; 3501 3502 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3503 3504 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3505 qemu_mutex_lock_iothread(); 3506 WITH_RCU_READ_LOCK_GUARD() { 3507 migration_bitmap_sync_precopy(rs); 3508 } 3509 qemu_mutex_unlock_iothread(); 3510 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3511 } 3512 3513 if (migrate_postcopy_ram()) { 3514 /* We can do postcopy, and all the data is postcopiable */ 3515 *can_postcopy += remaining_size; 3516 } else { 3517 *must_precopy += remaining_size; 3518 } 3519 } 3520 3521 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3522 { 3523 unsigned int xh_len; 3524 int xh_flags; 3525 uint8_t *loaded_data; 3526 3527 /* extract RLE header */ 3528 xh_flags = qemu_get_byte(f); 3529 xh_len = qemu_get_be16(f); 3530 3531 if 
(xh_flags != ENCODING_FLAG_XBZRLE) { 3532 error_report("Failed to load XBZRLE page - wrong compression!"); 3533 return -1; 3534 } 3535 3536 if (xh_len > TARGET_PAGE_SIZE) { 3537 error_report("Failed to load XBZRLE page - len overflow!"); 3538 return -1; 3539 } 3540 loaded_data = XBZRLE.decoded_buf; 3541 /* load data and decode */ 3542 /* it can change loaded_data to point to an internal buffer */ 3543 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3544 3545 /* decode RLE */ 3546 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3547 TARGET_PAGE_SIZE) == -1) { 3548 error_report("Failed to load XBZRLE page - decode error!"); 3549 return -1; 3550 } 3551 3552 return 0; 3553 } 3554 3555 /** 3556 * ram_block_from_stream: read a RAMBlock id from the migration stream 3557 * 3558 * Must be called from within a rcu critical section. 3559 * 3560 * Returns a pointer from within the RCU-protected ram_list. 3561 * 3562 * @mis: the migration incoming state pointer 3563 * @f: QEMUFile where to read the data from 3564 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3565 * @channel: the channel we're using 3566 */ 3567 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3568 QEMUFile *f, int flags, 3569 int channel) 3570 { 3571 RAMBlock *block = mis->last_recv_block[channel]; 3572 char id[256]; 3573 uint8_t len; 3574 3575 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3576 if (!block) { 3577 error_report("Ack, bad migration stream!"); 3578 return NULL; 3579 } 3580 return block; 3581 } 3582 3583 len = qemu_get_byte(f); 3584 qemu_get_buffer(f, (uint8_t *)id, len); 3585 id[len] = 0; 3586 3587 block = qemu_ram_block_by_name(id); 3588 if (!block) { 3589 error_report("Can't find block %s", id); 3590 return NULL; 3591 } 3592 3593 if (ramblock_is_ignored(block)) { 3594 error_report("block %s should not be migrated !", id); 3595 return NULL; 3596 } 3597 3598 mis->last_recv_block[channel] = block; 3599 3600 return block; 3601 } 3602 3603 static inline void *host_from_ram_block_offset(RAMBlock *block, 3604 ram_addr_t offset) 3605 { 3606 if (!offset_in_ramblock(block, offset)) { 3607 return NULL; 3608 } 3609 3610 return block->host + offset; 3611 } 3612 3613 static void *host_page_from_ram_block_offset(RAMBlock *block, 3614 ram_addr_t offset) 3615 { 3616 /* Note: Explicitly no check against offset_in_ramblock(). */ 3617 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3618 block->page_size); 3619 } 3620 3621 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3622 ram_addr_t offset) 3623 { 3624 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3625 } 3626 3627 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3628 ram_addr_t offset, bool record_bitmap) 3629 { 3630 if (!offset_in_ramblock(block, offset)) { 3631 return NULL; 3632 } 3633 if (!block->colo_cache) { 3634 error_report("%s: colo_cache is NULL in block :%s", 3635 __func__, block->idstr); 3636 return NULL; 3637 } 3638 3639 /* 3640 * During colo checkpoint, we need bitmap of these migrated pages. 3641 * It help us to decide which pages in ram cache should be flushed 3642 * into VM's RAM later. 
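     * Only pages whose bit was not already set bump migration_dirty_pages,
     * so the accounting stays in step with the bitmap.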
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * We use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
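        /*
         * The worker has exited at this point, so its mutex, condvar, zlib
         * stream and buffer can be torn down without racing against it.
         */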
qemu_mutex_destroy(&decomp_param[i].mutex); 3780 qemu_cond_destroy(&decomp_param[i].cond); 3781 inflateEnd(&decomp_param[i].stream); 3782 g_free(decomp_param[i].compbuf); 3783 decomp_param[i].compbuf = NULL; 3784 } 3785 g_free(decompress_threads); 3786 g_free(decomp_param); 3787 decompress_threads = NULL; 3788 decomp_param = NULL; 3789 decomp_file = NULL; 3790 } 3791 3792 static int compress_threads_load_setup(QEMUFile *f) 3793 { 3794 int i, thread_count; 3795 3796 if (!migrate_use_compression()) { 3797 return 0; 3798 } 3799 3800 thread_count = migrate_decompress_threads(); 3801 decompress_threads = g_new0(QemuThread, thread_count); 3802 decomp_param = g_new0(DecompressParam, thread_count); 3803 qemu_mutex_init(&decomp_done_lock); 3804 qemu_cond_init(&decomp_done_cond); 3805 decomp_file = f; 3806 for (i = 0; i < thread_count; i++) { 3807 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3808 goto exit; 3809 } 3810 3811 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3812 qemu_mutex_init(&decomp_param[i].mutex); 3813 qemu_cond_init(&decomp_param[i].cond); 3814 decomp_param[i].done = true; 3815 decomp_param[i].quit = false; 3816 qemu_thread_create(decompress_threads + i, "decompress", 3817 do_data_decompress, decomp_param + i, 3818 QEMU_THREAD_JOINABLE); 3819 } 3820 return 0; 3821 exit: 3822 compress_threads_load_cleanup(); 3823 return -1; 3824 } 3825 3826 static void decompress_data_with_multi_threads(QEMUFile *f, 3827 void *host, int len) 3828 { 3829 int idx, thread_count; 3830 3831 thread_count = migrate_decompress_threads(); 3832 QEMU_LOCK_GUARD(&decomp_done_lock); 3833 while (true) { 3834 for (idx = 0; idx < thread_count; idx++) { 3835 if (decomp_param[idx].done) { 3836 decomp_param[idx].done = false; 3837 qemu_mutex_lock(&decomp_param[idx].mutex); 3838 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3839 decomp_param[idx].des = host; 3840 decomp_param[idx].len = len; 3841 qemu_cond_signal(&decomp_param[idx].cond); 3842 qemu_mutex_unlock(&decomp_param[idx].mutex); 3843 break; 3844 } 3845 } 3846 if (idx < thread_count) { 3847 break; 3848 } else { 3849 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3850 } 3851 } 3852 } 3853 3854 static void colo_init_ram_state(void) 3855 { 3856 ram_state_init(&ram_state); 3857 } 3858 3859 /* 3860 * colo cache: this is for secondary VM, we cache the whole 3861 * memory of the secondary VM, it is need to hold the global lock 3862 * to call this helper. 3863 */ 3864 int colo_init_ram_cache(void) 3865 { 3866 RAMBlock *block; 3867 3868 WITH_RCU_READ_LOCK_GUARD() { 3869 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3870 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3871 NULL, false, false); 3872 if (!block->colo_cache) { 3873 error_report("%s: Can't alloc memory for COLO cache of block %s," 3874 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3875 block->used_length); 3876 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3877 if (block->colo_cache) { 3878 qemu_anon_ram_free(block->colo_cache, block->used_length); 3879 block->colo_cache = NULL; 3880 } 3881 } 3882 return -errno; 3883 } 3884 if (!machine_dump_guest_core(current_machine)) { 3885 qemu_madvise(block->colo_cache, block->used_length, 3886 QEMU_MADV_DONTDUMP); 3887 } 3888 } 3889 } 3890 3891 /* 3892 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3893 * with to decide which page in cache should be flushed into SVM's RAM. Here 3894 * we use the same name 'ram_bitmap' as for migration. 
3895 */ 3896 if (ram_bytes_total()) { 3897 RAMBlock *block; 3898 3899 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3900 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3901 block->bmap = bitmap_new(pages); 3902 } 3903 } 3904 3905 colo_init_ram_state(); 3906 return 0; 3907 } 3908 3909 /* TODO: duplicated with ram_init_bitmaps */ 3910 void colo_incoming_start_dirty_log(void) 3911 { 3912 RAMBlock *block = NULL; 3913 /* For memory_global_dirty_log_start below. */ 3914 qemu_mutex_lock_iothread(); 3915 qemu_mutex_lock_ramlist(); 3916 3917 memory_global_dirty_log_sync(); 3918 WITH_RCU_READ_LOCK_GUARD() { 3919 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3920 ramblock_sync_dirty_bitmap(ram_state, block); 3921 /* Discard this dirty bitmap record */ 3922 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3923 } 3924 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3925 } 3926 ram_state->migration_dirty_pages = 0; 3927 qemu_mutex_unlock_ramlist(); 3928 qemu_mutex_unlock_iothread(); 3929 } 3930 3931 /* It is need to hold the global lock to call this helper */ 3932 void colo_release_ram_cache(void) 3933 { 3934 RAMBlock *block; 3935 3936 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3937 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3938 g_free(block->bmap); 3939 block->bmap = NULL; 3940 } 3941 3942 WITH_RCU_READ_LOCK_GUARD() { 3943 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3944 if (block->colo_cache) { 3945 qemu_anon_ram_free(block->colo_cache, block->used_length); 3946 block->colo_cache = NULL; 3947 } 3948 } 3949 } 3950 ram_state_cleanup(&ram_state); 3951 } 3952 3953 /** 3954 * ram_load_setup: Setup RAM for migration incoming side 3955 * 3956 * Returns zero to indicate success and negative for error 3957 * 3958 * @f: QEMUFile where to receive the data 3959 * @opaque: RAMState pointer 3960 */ 3961 static int ram_load_setup(QEMUFile *f, void *opaque) 3962 { 3963 if (compress_threads_load_setup(f)) { 3964 return -1; 3965 } 3966 3967 xbzrle_load_setup(); 3968 ramblock_recv_map_init(); 3969 3970 return 0; 3971 } 3972 3973 static int ram_load_cleanup(void *opaque) 3974 { 3975 RAMBlock *rb; 3976 3977 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3978 qemu_ram_block_writeback(rb); 3979 } 3980 3981 xbzrle_load_cleanup(); 3982 compress_threads_load_cleanup(); 3983 3984 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3985 g_free(rb->receivedmap); 3986 rb->receivedmap = NULL; 3987 } 3988 3989 return 0; 3990 } 3991 3992 /** 3993 * ram_postcopy_incoming_init: allocate postcopy data structures 3994 * 3995 * Returns 0 for success and negative if there was one error 3996 * 3997 * @mis: current migration incoming state 3998 * 3999 * Allocate data structures etc needed by incoming migration with 4000 * postcopy-ram. postcopy-ram's similarly names 4001 * postcopy_ram_incoming_init does the work. 4002 */ 4003 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 4004 { 4005 return postcopy_ram_incoming_init(mis); 4006 } 4007 4008 /** 4009 * ram_load_postcopy: load a page in postcopy case 4010 * 4011 * Returns 0 for success or -errno in case of error 4012 * 4013 * Called in postcopy mode by ram_load(). 4014 * rcu_read_lock is taken prior to this being called. 
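 *
 * Pages are placed through userfaultfd (UFFDIO_COPY / UFFDIO_ZEROPAGE), so
 * a whole host page is accumulated in a temporary buffer before it is placed.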
4015 * 4016 * @f: QEMUFile where to send the data 4017 * @channel: the channel to use for loading 4018 */ 4019 int ram_load_postcopy(QEMUFile *f, int channel) 4020 { 4021 int flags = 0, ret = 0; 4022 bool place_needed = false; 4023 bool matches_target_page_size = false; 4024 MigrationIncomingState *mis = migration_incoming_get_current(); 4025 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 4026 4027 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4028 ram_addr_t addr; 4029 void *page_buffer = NULL; 4030 void *place_source = NULL; 4031 RAMBlock *block = NULL; 4032 uint8_t ch; 4033 int len; 4034 4035 addr = qemu_get_be64(f); 4036 4037 /* 4038 * If qemu file error, we should stop here, and then "addr" 4039 * may be invalid 4040 */ 4041 ret = qemu_file_get_error(f); 4042 if (ret) { 4043 break; 4044 } 4045 4046 flags = addr & ~TARGET_PAGE_MASK; 4047 addr &= TARGET_PAGE_MASK; 4048 4049 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 4050 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4051 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 4052 block = ram_block_from_stream(mis, f, flags, channel); 4053 if (!block) { 4054 ret = -EINVAL; 4055 break; 4056 } 4057 4058 /* 4059 * Relying on used_length is racy and can result in false positives. 4060 * We might place pages beyond used_length in case RAM was shrunk 4061 * while in postcopy, which is fine - trying to place via 4062 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 4063 */ 4064 if (!block->host || addr >= block->postcopy_length) { 4065 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4066 ret = -EINVAL; 4067 break; 4068 } 4069 tmp_page->target_pages++; 4070 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 4071 /* 4072 * Postcopy requires that we place whole host pages atomically; 4073 * these may be huge pages for RAMBlocks that are backed by 4074 * hugetlbfs. 4075 * To make it atomic, the data is read into a temporary page 4076 * that's moved into place later. 4077 * The migration protocol uses, possibly smaller, target-pages 4078 * however the source ensures it always sends all the components 4079 * of a host page in one chunk. 4080 */ 4081 page_buffer = tmp_page->tmp_huge_page + 4082 host_page_offset_from_ram_block_offset(block, addr); 4083 /* If all TP are zero then we can optimise the place */ 4084 if (tmp_page->target_pages == 1) { 4085 tmp_page->host_addr = 4086 host_page_from_ram_block_offset(block, addr); 4087 } else if (tmp_page->host_addr != 4088 host_page_from_ram_block_offset(block, addr)) { 4089 /* not the 1st TP within the HP */ 4090 error_report("Non-same host page detected on channel %d: " 4091 "Target host page %p, received host page %p " 4092 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 4093 channel, tmp_page->host_addr, 4094 host_page_from_ram_block_offset(block, addr), 4095 block->idstr, addr, tmp_page->target_pages); 4096 ret = -EINVAL; 4097 break; 4098 } 4099 4100 /* 4101 * If it's the last part of a host page then we place the host 4102 * page 4103 */ 4104 if (tmp_page->target_pages == 4105 (block->page_size / TARGET_PAGE_SIZE)) { 4106 place_needed = true; 4107 } 4108 place_source = tmp_page->tmp_huge_page; 4109 } 4110 4111 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4112 case RAM_SAVE_FLAG_ZERO: 4113 ch = qemu_get_byte(f); 4114 /* 4115 * Can skip to set page_buffer when 4116 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
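             * In that case the page is later installed with
             * postcopy_place_page_zero(), so filling the temporary buffer
             * here would be wasted work.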
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the contents of the RAM cache into the SVM's memory.
 * Only flush the pages dirtied by the PVM, the SVM, or both.
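 * Pages sent by the PVM were flagged in block->bmap when they were copied
 * into the colo cache; pages written by the SVM are picked up by the
 * dirty log sync below.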
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is sent earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many
         * iterations (here every 32768).
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage we must not load pages into the
             * SVM's memory directly; they are put into colo_cache first.
             * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, all of this memory was copied in the COLO
             * preparation stage while the VM had to be stopped, which is a
             * time-consuming process.
             * Here we optimize it by backing up every page as it is received
             * during the migration while COLO is enabled.  This slows the
             * migration down a little, but it clearly reduces the downtime
             * compared to backing up all of the SVM's memory in the COLO
             * preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM.  */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it to form the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
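 *
 * The bitmap arrives as a little-endian bit stream, padded to a multiple
 * of 8 bytes and terminated by RAMBLOCK_RECV_BITMAP_ENDING; see
 * ramblock_recv_bitmap_send() for the sending side.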
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock's */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused).
     * The dirty bitmap won't change, so we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap"; invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If this
     * is the last one to sync, we need to notify the main send thread.
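     * (ram_dirty_bitmap_sync_all() waits for exactly one rp_sem post per
     * ramblock, so every successful reload must post once.)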
4618 */ 4619 ram_dirty_bitmap_reload_notify(s); 4620 4621 ret = 0; 4622 out: 4623 g_free(le_bitmap); 4624 return ret; 4625 } 4626 4627 static int ram_resume_prepare(MigrationState *s, void *opaque) 4628 { 4629 RAMState *rs = *(RAMState **)opaque; 4630 int ret; 4631 4632 ret = ram_dirty_bitmap_sync_all(s, rs); 4633 if (ret) { 4634 return ret; 4635 } 4636 4637 ram_state_resume_prepare(rs, s->to_dst_file); 4638 4639 return 0; 4640 } 4641 4642 void postcopy_preempt_shutdown_file(MigrationState *s) 4643 { 4644 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4645 qemu_fflush(s->postcopy_qemufile_src); 4646 } 4647 4648 static SaveVMHandlers savevm_ram_handlers = { 4649 .save_setup = ram_save_setup, 4650 .save_live_iterate = ram_save_iterate, 4651 .save_live_complete_postcopy = ram_save_complete, 4652 .save_live_complete_precopy = ram_save_complete, 4653 .has_postcopy = ram_has_postcopy, 4654 .state_pending_exact = ram_state_pending_exact, 4655 .state_pending_estimate = ram_state_pending_estimate, 4656 .load_state = ram_load, 4657 .save_cleanup = ram_save_cleanup, 4658 .load_setup = ram_load_setup, 4659 .load_cleanup = ram_load_cleanup, 4660 .resume_prepare = ram_resume_prepare, 4661 }; 4662 4663 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4664 size_t old_size, size_t new_size) 4665 { 4666 PostcopyState ps = postcopy_state_get(); 4667 ram_addr_t offset; 4668 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4669 Error *err = NULL; 4670 4671 if (ramblock_is_ignored(rb)) { 4672 return; 4673 } 4674 4675 if (!migration_is_idle()) { 4676 /* 4677 * Precopy code on the source cannot deal with the size of RAM blocks 4678 * changing at random points in time - especially after sending the 4679 * RAM block sizes in the migration stream, they must no longer change. 4680 * Abort and indicate a proper reason. 4681 */ 4682 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4683 migration_cancel(err); 4684 error_free(err); 4685 } 4686 4687 switch (ps) { 4688 case POSTCOPY_INCOMING_ADVISE: 4689 /* 4690 * Update what ram_postcopy_incoming_init()->init_range() does at the 4691 * time postcopy was advised. Syncing RAM blocks with the source will 4692 * result in RAM resizes. 4693 */ 4694 if (old_size < new_size) { 4695 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4696 error_report("RAM block '%s' discard of resized RAM failed", 4697 rb->idstr); 4698 } 4699 } 4700 rb->postcopy_length = new_size; 4701 break; 4702 case POSTCOPY_INCOMING_NONE: 4703 case POSTCOPY_INCOMING_RUNNING: 4704 case POSTCOPY_INCOMING_END: 4705 /* 4706 * Once our guest is running, postcopy does no longer care about 4707 * resizes. When growing, the new memory was not available on the 4708 * source, no handler needed. 4709 */ 4710 break; 4711 default: 4712 error_report("RAM block '%s' resized during postcopy state: %d", 4713 rb->idstr, ps); 4714 exit(-1); 4715 } 4716 } 4717 4718 static RAMBlockNotifier ram_mig_ram_notifier = { 4719 .ram_block_resized = ram_mig_ram_block_resized, 4720 }; 4721 4722 void ram_mig_init(void) 4723 { 4724 qemu_mutex_init(&XBZRLE.lock); 4725 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4726 ram_block_notifier_add(&ram_mig_ram_notifier); 4727 } 4728
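
/*
 * Incoming-side call flow, as wired up above (sketch, for orientation only):
 *
 *   ram_load_setup()    - start decompression threads, init XBZRLE buffers
 *                         and the per-block receivedmap
 *   ram_load()          - per section: ram_load_precopy() or, once postcopy
 *                         is running, ram_load_postcopy()
 *   ram_load_cleanup()  - write back RAM blocks, stop decompression threads,
 *                         free receivedmap
 */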