1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "io/channel-null.h" 36 #include "xbzrle.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qmp/qerror.h" 49 #include "trace.h" 50 #include "exec/ram_addr.h" 51 #include "exec/target_page.h" 52 #include "qemu/rcu_queue.h" 53 #include "migration/colo.h" 54 #include "block.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 #include "options.h" 61 62 #include "hw/boards.h" /* for machine_dump_guest_core() */ 63 64 #if defined(__linux__) 65 #include "qemu/userfaultfd.h" 66 #endif /* defined(__linux__) */ 67 68 /***********************************************************/ 69 /* ram save/restore */ 70 71 /* 72 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 73 * worked for pages that were filled with the same char. We switched 74 * it to only search for the zero value. And to avoid confusion with 75 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it. 
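 * Note that all of the RAM_SAVE_FLAG_* values below are OR'ed into the low bits of the target-page aligned page offset written by save_page_header(), so the flags and the page offset share a single 64-bit word on the wire.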
76 */ 77 /* 78 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now 79 */ 80 #define RAM_SAVE_FLAG_FULL 0x01 81 #define RAM_SAVE_FLAG_ZERO 0x02 82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 83 #define RAM_SAVE_FLAG_PAGE 0x08 84 #define RAM_SAVE_FLAG_EOS 0x10 85 #define RAM_SAVE_FLAG_CONTINUE 0x20 86 #define RAM_SAVE_FLAG_XBZRLE 0x40 87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */ 88 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 89 /* We can't use any flag that is bigger than 0x200 */ 90 91 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, 92 uint8_t *, int) = xbzrle_encode_buffer; 93 #if defined(CONFIG_AVX512BW_OPT) 94 #include "qemu/cpuid.h" 95 static void __attribute__((constructor)) init_cpu_flag(void) 96 { 97 unsigned max = __get_cpuid_max(0, NULL); 98 int a, b, c, d; 99 if (max >= 1) { 100 __cpuid(1, a, b, c, d); 101 /* We must check that AVX is not just available, but usable. */ 102 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { 103 int bv; 104 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); 105 __cpuid_count(7, 0, a, b, c, d); 106 /* 0xe6: 107 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 108 * and ZMM16-ZMM31 state are enabled by OS) 109 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) 110 */ 111 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { 112 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; 113 } 114 } 115 } 116 } 117 #endif 118 119 XBZRLECacheStats xbzrle_counters; 120 121 /* used by the search for pages to send */ 122 struct PageSearchStatus { 123 /* The migration channel used for a specific host page */ 124 QEMUFile *pss_channel; 125 /* Last block from where we have sent data */ 126 RAMBlock *last_sent_block; 127 /* Current block being searched */ 128 RAMBlock *block; 129 /* Current page to search from */ 130 unsigned long page; 131 /* Set once we wrap around */ 132 bool complete_round; 133 /* Whether we're sending a host page */ 134 bool host_page_sending; 135 /* The start/end of current host page. Invalid if host_page_sending==false */ 136 unsigned long host_page_start; 137 unsigned long host_page_end; 138 }; 139 typedef struct PageSearchStatus PageSearchStatus; 140 141 /* struct contains XBZRLE cache and a static page 142 used by the compression */ 143 static struct { 144 /* buffer used for XBZRLE encoding */ 145 uint8_t *encoded_buf; 146 /* buffer for storing page content */ 147 uint8_t *current_buf; 148 /* Cache for XBZRLE, Protected by lock. */ 149 PageCache *cache; 150 QemuMutex lock; 151 /* it will store a page full of zeros */ 152 uint8_t *zero_target_page; 153 /* buffer used for XBZRLE decoding */ 154 uint8_t *decoded_buf; 155 } XBZRLE; 156 157 static void XBZRLE_cache_lock(void) 158 { 159 if (migrate_xbzrle()) { 160 qemu_mutex_lock(&XBZRLE.lock); 161 } 162 } 163 164 static void XBZRLE_cache_unlock(void) 165 { 166 if (migrate_xbzrle()) { 167 qemu_mutex_unlock(&XBZRLE.lock); 168 } 169 } 170 171 /** 172 * xbzrle_cache_resize: resize the xbzrle cache 173 * 174 * This function is called from migrate_params_apply in main 175 * thread, possibly while a migration is in progress. A running 176 * migration may be using the cache and might finish during this call, 177 * hence changes to the cache are protected by XBZRLE.lock(). 
178 * 179 * Returns 0 for success or -1 for error 180 * 181 * @new_size: new cache size 182 * @errp: set *errp if the check failed, with reason 183 */ 184 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 185 { 186 PageCache *new_cache; 187 int64_t ret = 0; 188 189 /* Check for truncation */ 190 if (new_size != (size_t)new_size) { 191 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 192 "exceeding address space"); 193 return -1; 194 } 195 196 if (new_size == migrate_xbzrle_cache_size()) { 197 /* nothing to do */ 198 return 0; 199 } 200 201 XBZRLE_cache_lock(); 202 203 if (XBZRLE.cache != NULL) { 204 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 205 if (!new_cache) { 206 ret = -1; 207 goto out; 208 } 209 210 cache_fini(XBZRLE.cache); 211 XBZRLE.cache = new_cache; 212 } 213 out: 214 XBZRLE_cache_unlock(); 215 return ret; 216 } 217 218 static bool postcopy_preempt_active(void) 219 { 220 return migrate_postcopy_preempt() && migration_in_postcopy(); 221 } 222 223 bool ramblock_is_ignored(RAMBlock *block) 224 { 225 return !qemu_ram_is_migratable(block) || 226 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 227 } 228 229 #undef RAMBLOCK_FOREACH 230 231 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 232 { 233 RAMBlock *block; 234 int ret = 0; 235 236 RCU_READ_LOCK_GUARD(); 237 238 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 239 ret = func(block, opaque); 240 if (ret) { 241 break; 242 } 243 } 244 return ret; 245 } 246 247 static void ramblock_recv_map_init(void) 248 { 249 RAMBlock *rb; 250 251 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 252 assert(!rb->receivedmap); 253 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 254 } 255 } 256 257 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 258 { 259 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 260 rb->receivedmap); 261 } 262 263 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 264 { 265 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 266 } 267 268 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 269 { 270 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 271 } 272 273 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 274 size_t nr) 275 { 276 bitmap_set_atomic(rb->receivedmap, 277 ramblock_recv_bitmap_offset(host_addr, rb), 278 nr); 279 } 280 281 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 282 283 /* 284 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 285 * 286 * Returns >0 if success with sent bytes, or <0 if error. 287 */ 288 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 289 const char *block_name) 290 { 291 RAMBlock *block = qemu_ram_block_by_name(block_name); 292 unsigned long *le_bitmap, nbits; 293 uint64_t size; 294 295 if (!block) { 296 error_report("%s: invalid block name: %s", __func__, block_name); 297 return -1; 298 } 299 300 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 301 302 /* 303 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 304 * machines we may need 4 more bytes for padding (see below 305 * comment). So extend it a bit before hand. 306 */ 307 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 308 309 /* 310 * Always use little endian when sending the bitmap. This is 311 * required that when source and destination VMs are not using the 312 * same endianness. (Note: big endian won't work.) 
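 * Converting to a fixed byte order means the destination can parse the bitmap even when its host endianness differs from the source's.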
313 */ 314 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 315 316 /* Size of the bitmap, in bytes */ 317 size = DIV_ROUND_UP(nbits, 8); 318 319 /* 320 * size is always aligned to 8 bytes for 64bit machines, but it 321 * may not be true for 32bit machines. We need this padding to 322 * make sure the migration can survive even between 32bit and 323 * 64bit machines. 324 */ 325 size = ROUND_UP(size, 8); 326 327 qemu_put_be64(file, size); 328 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 329 /* 330 * Mark as an end, in case the middle part is screwed up due to 331 * some "mysterious" reason. 332 */ 333 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 334 qemu_fflush(file); 335 336 g_free(le_bitmap); 337 338 if (qemu_file_get_error(file)) { 339 return qemu_file_get_error(file); 340 } 341 342 return size + sizeof(size); 343 } 344 345 /* 346 * An outstanding page request, on the source, having been received 347 * and queued 348 */ 349 struct RAMSrcPageRequest { 350 RAMBlock *rb; 351 hwaddr offset; 352 hwaddr len; 353 354 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 355 }; 356 357 /* State of RAM for migration */ 358 struct RAMState { 359 /* 360 * PageSearchStatus structures for the channels when send pages. 361 * Protected by the bitmap_mutex. 362 */ 363 PageSearchStatus pss[RAM_CHANNEL_MAX]; 364 /* UFFD file descriptor, used in 'write-tracking' migration */ 365 int uffdio_fd; 366 /* total ram size in bytes */ 367 uint64_t ram_bytes_total; 368 /* Last block that we have visited searching for dirty pages */ 369 RAMBlock *last_seen_block; 370 /* Last dirty target page we have sent */ 371 ram_addr_t last_page; 372 /* last ram version we have seen */ 373 uint32_t last_version; 374 /* How many times we have dirty too many pages */ 375 int dirty_rate_high_cnt; 376 /* these variables are used for bitmap sync */ 377 /* last time we did a full bitmap_sync */ 378 int64_t time_last_bitmap_sync; 379 /* bytes transferred at start_time */ 380 uint64_t bytes_xfer_prev; 381 /* number of dirty pages since start_time */ 382 uint64_t num_dirty_pages_period; 383 /* xbzrle misses since the beginning of the period */ 384 uint64_t xbzrle_cache_miss_prev; 385 /* Amount of xbzrle pages since the beginning of the period */ 386 uint64_t xbzrle_pages_prev; 387 /* Amount of xbzrle encoded bytes since the beginning of the period */ 388 uint64_t xbzrle_bytes_prev; 389 /* Start using XBZRLE (e.g., after the first round). 
*/ 390 bool xbzrle_enabled; 391 /* Are we on the last stage of migration */ 392 bool last_stage; 393 /* compression statistics since the beginning of the period */ 394 /* amount of count that no free thread to compress data */ 395 uint64_t compress_thread_busy_prev; 396 /* amount bytes after compression */ 397 uint64_t compressed_size_prev; 398 /* amount of compressed pages */ 399 uint64_t compress_pages_prev; 400 401 /* total handled target pages at the beginning of period */ 402 uint64_t target_page_count_prev; 403 /* total handled target pages since start */ 404 uint64_t target_page_count; 405 /* number of dirty bits in the bitmap */ 406 uint64_t migration_dirty_pages; 407 /* 408 * Protects: 409 * - dirty/clear bitmap 410 * - migration_dirty_pages 411 * - pss structures 412 */ 413 QemuMutex bitmap_mutex; 414 /* The RAMBlock used in the last src_page_requests */ 415 RAMBlock *last_req_rb; 416 /* Queue of outstanding page requests from the destination */ 417 QemuMutex src_page_req_mutex; 418 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 419 }; 420 typedef struct RAMState RAMState; 421 422 static RAMState *ram_state; 423 424 static NotifierWithReturnList precopy_notifier_list; 425 426 /* Whether postcopy has queued requests? */ 427 static bool postcopy_has_request(RAMState *rs) 428 { 429 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 430 } 431 432 void precopy_infrastructure_init(void) 433 { 434 notifier_with_return_list_init(&precopy_notifier_list); 435 } 436 437 void precopy_add_notifier(NotifierWithReturn *n) 438 { 439 notifier_with_return_list_add(&precopy_notifier_list, n); 440 } 441 442 void precopy_remove_notifier(NotifierWithReturn *n) 443 { 444 notifier_with_return_remove(n); 445 } 446 447 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 448 { 449 PrecopyNotifyData pnd; 450 pnd.reason = reason; 451 pnd.errp = errp; 452 453 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 454 } 455 456 uint64_t ram_bytes_remaining(void) 457 { 458 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 459 0; 460 } 461 462 RAMStats ram_counters; 463 464 void ram_transferred_add(uint64_t bytes) 465 { 466 if (runstate_is_running()) { 467 stat64_add(&ram_counters.precopy_bytes, bytes); 468 } else if (migration_in_postcopy()) { 469 stat64_add(&ram_counters.postcopy_bytes, bytes); 470 } else { 471 stat64_add(&ram_counters.downtime_bytes, bytes); 472 } 473 stat64_add(&ram_counters.transferred, bytes); 474 } 475 476 struct MigrationOps { 477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 478 }; 479 typedef struct MigrationOps MigrationOps; 480 481 MigrationOps *migration_ops; 482 483 CompressionStats compression_counters; 484 485 struct CompressParam { 486 bool done; 487 bool quit; 488 bool zero_page; 489 QEMUFile *file; 490 QemuMutex mutex; 491 QemuCond cond; 492 RAMBlock *block; 493 ram_addr_t offset; 494 495 /* internally used fields */ 496 z_stream stream; 497 uint8_t *originbuf; 498 }; 499 typedef struct CompressParam CompressParam; 500 501 struct DecompressParam { 502 bool done; 503 bool quit; 504 QemuMutex mutex; 505 QemuCond cond; 506 void *des; 507 uint8_t *compbuf; 508 int len; 509 z_stream stream; 510 }; 511 typedef struct DecompressParam DecompressParam; 512 513 static CompressParam *comp_param; 514 static QemuThread *compress_threads; 515 /* comp_done_cond is used to wake up the migration thread when 516 * one of the compression threads has finished the compression. 
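 * A worker marks its CompressParam as done and signals the condition from do_data_compress(); the migration thread waits for it in flush_compressed_data() and compress_page_with_multi_thread().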
517 * comp_done_lock is used to co-work with comp_done_cond. 518 */ 519 static QemuMutex comp_done_lock; 520 static QemuCond comp_done_cond; 521 522 static QEMUFile *decomp_file; 523 static DecompressParam *decomp_param; 524 static QemuThread *decompress_threads; 525 static QemuMutex decomp_done_lock; 526 static QemuCond decomp_done_cond; 527 528 static int ram_save_host_page_urgent(PageSearchStatus *pss); 529 530 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 531 ram_addr_t offset, uint8_t *source_buf); 532 533 /* NOTE: page is the PFN not real ram_addr_t. */ 534 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 535 { 536 pss->block = rb; 537 pss->page = page; 538 pss->complete_round = false; 539 } 540 541 /* 542 * Check whether two PSSs are actively sending the same page. Return true 543 * if it is, false otherwise. 544 */ 545 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 546 { 547 return pss1->host_page_sending && pss2->host_page_sending && 548 (pss1->host_page_start == pss2->host_page_start); 549 } 550 551 static void *do_data_compress(void *opaque) 552 { 553 CompressParam *param = opaque; 554 RAMBlock *block; 555 ram_addr_t offset; 556 bool zero_page; 557 558 qemu_mutex_lock(¶m->mutex); 559 while (!param->quit) { 560 if (param->block) { 561 block = param->block; 562 offset = param->offset; 563 param->block = NULL; 564 qemu_mutex_unlock(¶m->mutex); 565 566 zero_page = do_compress_ram_page(param->file, ¶m->stream, 567 block, offset, param->originbuf); 568 569 qemu_mutex_lock(&comp_done_lock); 570 param->done = true; 571 param->zero_page = zero_page; 572 qemu_cond_signal(&comp_done_cond); 573 qemu_mutex_unlock(&comp_done_lock); 574 575 qemu_mutex_lock(¶m->mutex); 576 } else { 577 qemu_cond_wait(¶m->cond, ¶m->mutex); 578 } 579 } 580 qemu_mutex_unlock(¶m->mutex); 581 582 return NULL; 583 } 584 585 static void compress_threads_save_cleanup(void) 586 { 587 int i, thread_count; 588 589 if (!migrate_compress() || !comp_param) { 590 return; 591 } 592 593 thread_count = migrate_compress_threads(); 594 for (i = 0; i < thread_count; i++) { 595 /* 596 * we use it as a indicator which shows if the thread is 597 * properly init'd or not 598 */ 599 if (!comp_param[i].file) { 600 break; 601 } 602 603 qemu_mutex_lock(&comp_param[i].mutex); 604 comp_param[i].quit = true; 605 qemu_cond_signal(&comp_param[i].cond); 606 qemu_mutex_unlock(&comp_param[i].mutex); 607 608 qemu_thread_join(compress_threads + i); 609 qemu_mutex_destroy(&comp_param[i].mutex); 610 qemu_cond_destroy(&comp_param[i].cond); 611 deflateEnd(&comp_param[i].stream); 612 g_free(comp_param[i].originbuf); 613 qemu_fclose(comp_param[i].file); 614 comp_param[i].file = NULL; 615 } 616 qemu_mutex_destroy(&comp_done_lock); 617 qemu_cond_destroy(&comp_done_cond); 618 g_free(compress_threads); 619 g_free(comp_param); 620 compress_threads = NULL; 621 comp_param = NULL; 622 } 623 624 static int compress_threads_save_setup(void) 625 { 626 int i, thread_count; 627 628 if (!migrate_compress()) { 629 return 0; 630 } 631 thread_count = migrate_compress_threads(); 632 compress_threads = g_new0(QemuThread, thread_count); 633 comp_param = g_new0(CompressParam, thread_count); 634 qemu_cond_init(&comp_done_cond); 635 qemu_mutex_init(&comp_done_lock); 636 for (i = 0; i < thread_count; i++) { 637 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 638 if (!comp_param[i].originbuf) { 639 goto exit; 640 } 641 642 if (deflateInit(&comp_param[i].stream, 643 
migrate_compress_level()) != Z_OK) { 644 g_free(comp_param[i].originbuf); 645 goto exit; 646 } 647 648 /* comp_param[i].file is just used as a dummy buffer to save data, 649 * set its ops to empty. 650 */ 651 comp_param[i].file = qemu_file_new_output( 652 QIO_CHANNEL(qio_channel_null_new())); 653 comp_param[i].done = true; 654 comp_param[i].quit = false; 655 qemu_mutex_init(&comp_param[i].mutex); 656 qemu_cond_init(&comp_param[i].cond); 657 qemu_thread_create(compress_threads + i, "compress", 658 do_data_compress, comp_param + i, 659 QEMU_THREAD_JOINABLE); 660 } 661 return 0; 662 663 exit: 664 compress_threads_save_cleanup(); 665 return -1; 666 } 667 668 /** 669 * save_page_header: write page header to wire 670 * 671 * If this is the 1st block, it also writes the block identification 672 * 673 * Returns the number of bytes written 674 * 675 * @pss: current PSS channel status 676 * @block: block that contains the page we want to send 677 * @offset: offset inside the block for the page 678 * in the lower bits, it contains flags 679 */ 680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 681 RAMBlock *block, ram_addr_t offset) 682 { 683 size_t size, len; 684 bool same_block = (block == pss->last_sent_block); 685 686 if (same_block) { 687 offset |= RAM_SAVE_FLAG_CONTINUE; 688 } 689 qemu_put_be64(f, offset); 690 size = 8; 691 692 if (!same_block) { 693 len = strlen(block->idstr); 694 qemu_put_byte(f, len); 695 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 696 size += 1 + len; 697 pss->last_sent_block = block; 698 } 699 return size; 700 } 701 702 /** 703 * mig_throttle_guest_down: throttle down the guest 704 * 705 * Reduce amount of guest cpu execution to hopefully slow down memory 706 * writes. If guest dirty memory rate is reduced below the rate at 707 * which we can transfer pages to the destination then we should be 708 * able to complete migration. Some workloads dirty memory way too 709 * fast and will not effectively converge, even with auto-converge. 710 */ 711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 712 uint64_t bytes_dirty_threshold) 713 { 714 MigrationState *s = migrate_get_current(); 715 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 716 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 717 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 718 int pct_max = s->parameters.max_cpu_throttle; 719 720 uint64_t throttle_now = cpu_throttle_get_percentage(); 721 uint64_t cpu_now, cpu_ideal, throttle_inc; 722 723 /* We have not started throttling yet. Let's start it. */ 724 if (!cpu_throttle_active()) { 725 cpu_throttle_set(pct_initial); 726 } else { 727 /* Throttling already on, just increase the rate */ 728 if (!pct_tailslow) { 729 throttle_inc = pct_increment; 730 } else { 731 /* Compute the ideal CPU percentage used by Guest, which may 732 * make the dirty rate match the dirty rate threshold. 
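 * For example, if the guest is currently throttled to 40% (cpu_now = 60) and bytes_dirty_period is twice bytes_dirty_threshold, then cpu_ideal = 60 * 0.5 = 30 and the throttle is raised by MIN(30, pct_increment).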
*/ 733 cpu_now = 100 - throttle_now; 734 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 735 bytes_dirty_period); 736 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 737 } 738 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 739 } 740 } 741 742 void mig_throttle_counter_reset(void) 743 { 744 RAMState *rs = ram_state; 745 746 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 747 rs->num_dirty_pages_period = 0; 748 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred); 749 } 750 751 /** 752 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 753 * 754 * @rs: current RAM state 755 * @current_addr: address for the zero page 756 * 757 * Update the xbzrle cache to reflect a page that's been sent as all 0. 758 * The important thing is that a stale (not-yet-0'd) page be replaced 759 * by the new data. 760 * As a bonus, if the page wasn't in the cache it gets added so that 761 * when a small write is made into the 0'd page it gets XBZRLE sent. 762 */ 763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 764 { 765 /* We don't care if this fails to allocate a new cache page 766 * as long as it updated an old one */ 767 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 768 stat64_get(&ram_counters.dirty_sync_count)); 769 } 770 771 #define ENCODING_FLAG_XBZRLE 0x1 772 773 /** 774 * save_xbzrle_page: compress and send current page 775 * 776 * Returns: 1 means that we wrote the page 777 * 0 means that page is identical to the one already sent 778 * -1 means that xbzrle would be longer than normal 779 * 780 * @rs: current RAM state 781 * @pss: current PSS channel 782 * @current_data: pointer to the address of the page contents 783 * @current_addr: addr of the page 784 * @block: block that contains the page we want to send 785 * @offset: offset inside the block for the page 786 */ 787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 788 uint8_t **current_data, ram_addr_t current_addr, 789 RAMBlock *block, ram_addr_t offset) 790 { 791 int encoded_len = 0, bytes_xbzrle; 792 uint8_t *prev_cached_page; 793 QEMUFile *file = pss->pss_channel; 794 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count); 795 796 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 797 xbzrle_counters.cache_miss++; 798 if (!rs->last_stage) { 799 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 800 generation) == -1) { 801 return -1; 802 } else { 803 /* update *current_data when the page has been 804 inserted into cache */ 805 *current_data = get_cached_data(XBZRLE.cache, current_addr); 806 } 807 } 808 return -1; 809 } 810 811 /* 812 * Reaching here means the page has hit the xbzrle cache, no matter what 813 * encoding result it is (normal encoding, overflow or skipping the page), 814 * count the page as encoded. This is used to calculate the encoding rate. 815 * 816 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 817 * 2nd page turns out to be skipped (i.e. no new bytes written to the 818 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 819 * skipped page included. In this way, the encoding rate can tell if the 820 * guest page is good for xbzrle encoding. 
821 */ 822 xbzrle_counters.pages++; 823 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 824 825 /* save current buffer into memory */ 826 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 827 828 /* XBZRLE encoding (if there is no overflow) */ 829 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, 830 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 831 TARGET_PAGE_SIZE); 832 833 /* 834 * Update the cache contents, so that it corresponds to the data 835 * sent, in all cases except where we skip the page. 836 */ 837 if (!rs->last_stage && encoded_len != 0) { 838 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 839 /* 840 * In the case where we couldn't compress, ensure that the caller 841 * sends the data from the cache, since the guest might have 842 * changed the RAM since we copied it. 843 */ 844 *current_data = prev_cached_page; 845 } 846 847 if (encoded_len == 0) { 848 trace_save_xbzrle_page_skipping(); 849 return 0; 850 } else if (encoded_len == -1) { 851 trace_save_xbzrle_page_overflow(); 852 xbzrle_counters.overflow++; 853 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 854 return -1; 855 } 856 857 /* Send XBZRLE based compressed page */ 858 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 859 offset | RAM_SAVE_FLAG_XBZRLE); 860 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 861 qemu_put_be16(file, encoded_len); 862 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 863 bytes_xbzrle += encoded_len + 1 + 2; 864 /* 865 * Like compressed_size (please see update_compress_thread_counts), 866 * the xbzrle encoded bytes don't count the 8 byte header with 867 * RAM_SAVE_FLAG_CONTINUE. 868 */ 869 xbzrle_counters.bytes += bytes_xbzrle - 8; 870 ram_transferred_add(bytes_xbzrle); 871 872 return 1; 873 } 874 875 /** 876 * pss_find_next_dirty: find the next dirty page of current ramblock 877 * 878 * This function updates pss->page to point to the next dirty page index 879 * within the ramblock to migrate, or the end of ramblock when nothing 880 * found. Note that when pss->host_page_sending==true it means we're 881 * during sending a host page, so we won't look for dirty page that is 882 * outside the host page boundary. 883 * 884 * @pss: the current page search status 885 */ 886 static void pss_find_next_dirty(PageSearchStatus *pss) 887 { 888 RAMBlock *rb = pss->block; 889 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 890 unsigned long *bitmap = rb->bmap; 891 892 if (ramblock_is_ignored(rb)) { 893 /* Points directly to the end, so we know no dirty page */ 894 pss->page = size; 895 return; 896 } 897 898 /* 899 * If during sending a host page, only look for dirty pages within the 900 * current host page being send. 901 */ 902 if (pss->host_page_sending) { 903 assert(pss->host_page_end); 904 size = MIN(size, pss->host_page_end); 905 } 906 907 pss->page = find_next_bit(bitmap, size, pss->page); 908 } 909 910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 911 unsigned long page) 912 { 913 uint8_t shift; 914 hwaddr size, start; 915 916 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 917 return; 918 } 919 920 shift = rb->clear_bmap_shift; 921 /* 922 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 923 * can make things easier sometimes since then start address 924 * of the small chunk will always be 64 pages aligned so the 925 * bitmap will always be aligned to unsigned long. We should 926 * even be able to remove this restriction but I'm simply 927 * keeping it. 
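 * With the minimum shift of 6 and 4KiB target pages, each clear chunk covers 64 pages (256KiB) and the chunk start computed below is aligned to that size.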
928 */ 929 assert(shift >= 6); 930 931 size = 1ULL << (TARGET_PAGE_BITS + shift); 932 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 933 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 934 memory_region_clear_dirty_bitmap(rb->mr, start, size); 935 } 936 937 static void 938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 939 unsigned long start, 940 unsigned long npages) 941 { 942 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 943 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 944 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 945 946 /* 947 * Clear pages from start to start + npages - 1, so the end boundary is 948 * exclusive. 949 */ 950 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 951 migration_clear_memory_region_dirty_bitmap(rb, i); 952 } 953 } 954 955 /* 956 * colo_bitmap_find_dirty: find contiguous dirty pages from start 957 * 958 * Returns the page offset within the memory region of the start of the contiguous 959 * dirty pages 960 * 961 * @rs: current RAM state 962 * @rb: RAMBlock where to search for dirty pages 963 * @start: page where we start the search 964 * @num: the number of contiguous dirty pages 965 */ 966 static inline 967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 968 unsigned long start, unsigned long *num) 969 { 970 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 971 unsigned long *bitmap = rb->bmap; 972 unsigned long first, next; 973 974 *num = 0; 975 976 if (ramblock_is_ignored(rb)) { 977 return size; 978 } 979 980 first = find_next_bit(bitmap, size, start); 981 if (first >= size) { 982 return first; 983 } 984 next = find_next_zero_bit(bitmap, size, first + 1); 985 assert(next >= first); 986 *num = next - first; 987 return first; 988 } 989 990 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 991 RAMBlock *rb, 992 unsigned long page) 993 { 994 bool ret; 995 996 /* 997 * Clear dirty bitmap if needed. This _must_ be called before we 998 * send any page of the chunk because we need to make sure 999 * we can capture further page content changes when we sync the dirty 1000 * log the next time. So as long as we are going to send any 1001 * page of the chunk we clear the remote dirty bitmap for all of them. 1002 * Clearing it earlier won't be a problem, but too late will. 1003 */ 1004 migration_clear_memory_region_dirty_bitmap(rb, page); 1005 1006 ret = test_and_clear_bit(page, rb->bmap); 1007 if (ret) { 1008 rs->migration_dirty_pages--; 1009 } 1010 1011 return ret; 1012 } 1013 1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section, 1015 void *opaque) 1016 { 1017 const hwaddr offset = section->offset_within_region; 1018 const hwaddr size = int128_get64(section->size); 1019 const unsigned long start = offset >> TARGET_PAGE_BITS; 1020 const unsigned long npages = size >> TARGET_PAGE_BITS; 1021 RAMBlock *rb = section->mr->ram_block; 1022 uint64_t *cleared_bits = opaque; 1023 1024 /* 1025 * We don't grab ram_state->bitmap_mutex because we expect to run 1026 * only when starting migration or during postcopy recovery where 1027 * we don't have concurrent access.
1028 */ 1029 if (!migration_in_postcopy() && !migrate_background_snapshot()) { 1030 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages); 1031 } 1032 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); 1033 bitmap_clear(rb->bmap, start, npages); 1034 } 1035 1036 /* 1037 * Exclude all dirty pages from migration that fall into a discarded range as 1038 * managed by a RamDiscardManager responsible for the mapped memory region of 1039 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps. 1040 * 1041 * Discarded pages ("logically unplugged") have undefined content and must 1042 * not get migrated, because even reading these pages for migration might 1043 * result in undesired behavior. 1044 * 1045 * Returns the number of cleared bits in the RAMBlock dirty bitmap. 1046 * 1047 * Note: The result is only stable while migrating (precopy/postcopy). 1048 */ 1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 1050 { 1051 uint64_t cleared_bits = 0; 1052 1053 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 1054 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1055 MemoryRegionSection section = { 1056 .mr = rb->mr, 1057 .offset_within_region = 0, 1058 .size = int128_make64(qemu_ram_get_used_length(rb)), 1059 }; 1060 1061 ram_discard_manager_replay_discarded(rdm, §ion, 1062 dirty_bitmap_clear_section, 1063 &cleared_bits); 1064 } 1065 return cleared_bits; 1066 } 1067 1068 /* 1069 * Check if a host-page aligned page falls into a discarded range as managed by 1070 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 1071 * 1072 * Note: The result is only stable while migrating (precopy/postcopy). 1073 */ 1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 1075 { 1076 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1077 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1078 MemoryRegionSection section = { 1079 .mr = rb->mr, 1080 .offset_within_region = start, 1081 .size = int128_make64(qemu_ram_pagesize(rb)), 1082 }; 1083 1084 return !ram_discard_manager_is_populated(rdm, §ion); 1085 } 1086 return false; 1087 } 1088 1089 /* Called with RCU critical section */ 1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 1091 { 1092 uint64_t new_dirty_pages = 1093 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 1094 1095 rs->migration_dirty_pages += new_dirty_pages; 1096 rs->num_dirty_pages_period += new_dirty_pages; 1097 } 1098 1099 /** 1100 * ram_pagesize_summary: calculate all the pagesizes of a VM 1101 * 1102 * Returns a summary bitmap of the page sizes of all RAMBlocks 1103 * 1104 * For VMs with just normal pages this is equivalent to the host page 1105 * size. If it's got some huge pages then it's the OR of all the 1106 * different page sizes. 
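 * For example, a guest using 4KiB pages plus 2MiB hugetlbfs pages reports 0x1000 | 0x200000 = 0x201000.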
1107 */ 1108 uint64_t ram_pagesize_summary(void) 1109 { 1110 RAMBlock *block; 1111 uint64_t summary = 0; 1112 1113 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1114 summary |= block->page_size; 1115 } 1116 1117 return summary; 1118 } 1119 1120 uint64_t ram_get_total_transferred_pages(void) 1121 { 1122 return stat64_get(&ram_counters.normal_pages) + 1123 stat64_get(&ram_counters.zero_pages) + 1124 compression_counters.pages + xbzrle_counters.pages; 1125 } 1126 1127 static void migration_update_rates(RAMState *rs, int64_t end_time) 1128 { 1129 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1130 double compressed_size; 1131 1132 /* calculate period counters */ 1133 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1134 / (end_time - rs->time_last_bitmap_sync); 1135 1136 if (!page_count) { 1137 return; 1138 } 1139 1140 if (migrate_xbzrle()) { 1141 double encoded_size, unencoded_size; 1142 1143 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1144 rs->xbzrle_cache_miss_prev) / page_count; 1145 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1146 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1147 TARGET_PAGE_SIZE; 1148 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1149 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1150 xbzrle_counters.encoding_rate = 0; 1151 } else { 1152 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1153 } 1154 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1155 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1156 } 1157 1158 if (migrate_compress()) { 1159 compression_counters.busy_rate = (double)(compression_counters.busy - 1160 rs->compress_thread_busy_prev) / page_count; 1161 rs->compress_thread_busy_prev = compression_counters.busy; 1162 1163 compressed_size = compression_counters.compressed_size - 1164 rs->compressed_size_prev; 1165 if (compressed_size) { 1166 double uncompressed_size = (compression_counters.pages - 1167 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1168 1169 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1170 compression_counters.compression_rate = 1171 uncompressed_size / compressed_size; 1172 1173 rs->compress_pages_prev = compression_counters.pages; 1174 rs->compressed_size_prev = compression_counters.compressed_size; 1175 } 1176 } 1177 } 1178 1179 static void migration_trigger_throttle(RAMState *rs) 1180 { 1181 MigrationState *s = migrate_get_current(); 1182 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1183 uint64_t bytes_xfer_period = 1184 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev; 1185 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1186 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1187 1188 /* During block migration the auto-converge logic incorrectly detects 1189 * that ram migration makes no progress. Avoid this by disabling the 1190 * throttling logic during the bulk phase of block migration. */ 1191 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1192 /* The following detection logic can be refined later. For now: 1193 Check to see if the ratio between dirtied bytes and the approx. 1194 amount of bytes that just got transferred since the last time 1195 we were in this routine reaches the threshold. If that happens 1196 twice, start or increase throttling. 
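 In other words: throttle once bytes_dirty_period exceeds bytes_dirty_threshold (i.e. 'threshold' percent of bytes_xfer_period) for the second time.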
*/ 1197 1198 if ((bytes_dirty_period > bytes_dirty_threshold) && 1199 (++rs->dirty_rate_high_cnt >= 2)) { 1200 trace_migration_throttle(); 1201 rs->dirty_rate_high_cnt = 0; 1202 mig_throttle_guest_down(bytes_dirty_period, 1203 bytes_dirty_threshold); 1204 } 1205 } 1206 } 1207 1208 static void migration_bitmap_sync(RAMState *rs) 1209 { 1210 RAMBlock *block; 1211 int64_t end_time; 1212 1213 stat64_add(&ram_counters.dirty_sync_count, 1); 1214 1215 if (!rs->time_last_bitmap_sync) { 1216 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1217 } 1218 1219 trace_migration_bitmap_sync_start(); 1220 memory_global_dirty_log_sync(); 1221 1222 qemu_mutex_lock(&rs->bitmap_mutex); 1223 WITH_RCU_READ_LOCK_GUARD() { 1224 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1225 ramblock_sync_dirty_bitmap(rs, block); 1226 } 1227 ram_counters.remaining = ram_bytes_remaining(); 1228 } 1229 qemu_mutex_unlock(&rs->bitmap_mutex); 1230 1231 memory_global_after_dirty_log_sync(); 1232 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1233 1234 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1235 1236 /* more than 1 second = 1000 millisecons */ 1237 if (end_time > rs->time_last_bitmap_sync + 1000) { 1238 migration_trigger_throttle(rs); 1239 1240 migration_update_rates(rs, end_time); 1241 1242 rs->target_page_count_prev = rs->target_page_count; 1243 1244 /* reset period counters */ 1245 rs->time_last_bitmap_sync = end_time; 1246 rs->num_dirty_pages_period = 0; 1247 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred); 1248 } 1249 if (migrate_events()) { 1250 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count); 1251 qapi_event_send_migration_pass(generation); 1252 } 1253 } 1254 1255 static void migration_bitmap_sync_precopy(RAMState *rs) 1256 { 1257 Error *local_err = NULL; 1258 1259 /* 1260 * The current notifier usage is just an optimization to migration, so we 1261 * don't stop the normal migration process in the error case. 1262 */ 1263 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1264 error_report_err(local_err); 1265 local_err = NULL; 1266 } 1267 1268 migration_bitmap_sync(rs); 1269 1270 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1271 error_report_err(local_err); 1272 } 1273 } 1274 1275 void ram_release_page(const char *rbname, uint64_t offset) 1276 { 1277 if (!migrate_release_ram() || !migration_in_postcopy()) { 1278 return; 1279 } 1280 1281 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1282 } 1283 1284 /** 1285 * save_zero_page_to_file: send the zero page to the file 1286 * 1287 * Returns the size of data written to the file, 0 means the page is not 1288 * a zero page 1289 * 1290 * @pss: current PSS channel 1291 * @block: block that contains the page we want to send 1292 * @offset: offset inside the block for the page 1293 */ 1294 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file, 1295 RAMBlock *block, ram_addr_t offset) 1296 { 1297 uint8_t *p = block->host + offset; 1298 int len = 0; 1299 1300 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1301 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO); 1302 qemu_put_byte(file, 0); 1303 len += 1; 1304 ram_release_page(block->idstr, offset); 1305 } 1306 return len; 1307 } 1308 1309 /** 1310 * save_zero_page: send the zero page to the stream 1311 * 1312 * Returns the number of pages written. 
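 * (1 if a zero page was sent, or -1 if the page was not all zeroes and nothing was written)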
1313 * 1314 * @pss: current PSS channel 1315 * @block: block that contains the page we want to send 1316 * @offset: offset inside the block for the page 1317 */ 1318 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, 1319 ram_addr_t offset) 1320 { 1321 int len = save_zero_page_to_file(pss, f, block, offset); 1322 1323 if (len) { 1324 stat64_add(&ram_counters.zero_pages, 1); 1325 ram_transferred_add(len); 1326 return 1; 1327 } 1328 return -1; 1329 } 1330 1331 /* 1332 * @pages: the number of pages written by the control path, 1333 * < 0 - error 1334 * > 0 - number of pages written 1335 * 1336 * Returns true if the page has been saved, otherwise false is returned. 1337 */ 1338 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, 1339 ram_addr_t offset, int *pages) 1340 { 1341 uint64_t bytes_xmit = 0; 1342 int ret; 1343 1344 *pages = -1; 1345 ret = ram_control_save_page(pss->pss_channel, block->offset, offset, 1346 TARGET_PAGE_SIZE, &bytes_xmit); 1347 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1348 return false; 1349 } 1350 1351 if (bytes_xmit) { 1352 ram_transferred_add(bytes_xmit); 1353 *pages = 1; 1354 } 1355 1356 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1357 return true; 1358 } 1359 1360 if (bytes_xmit > 0) { 1361 stat64_add(&ram_counters.normal_pages, 1); 1362 } else if (bytes_xmit == 0) { 1363 stat64_add(&ram_counters.zero_pages, 1); 1364 } 1365 1366 return true; 1367 } 1368 1369 /* 1370 * directly send the page to the stream 1371 * 1372 * Returns the number of pages written. 1373 * 1374 * @pss: current PSS channel 1375 * @block: block that contains the page we want to send 1376 * @offset: offset inside the block for the page 1377 * @buf: the page to be sent 1378 * @async: send the page asynchronously 1379 */ 1380 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, 1381 ram_addr_t offset, uint8_t *buf, bool async) 1382 { 1383 QEMUFile *file = pss->pss_channel; 1384 1385 ram_transferred_add(save_page_header(pss, pss->pss_channel, block, 1386 offset | RAM_SAVE_FLAG_PAGE)); 1387 if (async) { 1388 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, 1389 migrate_release_ram() && 1390 migration_in_postcopy()); 1391 } else { 1392 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); 1393 } 1394 ram_transferred_add(TARGET_PAGE_SIZE); 1395 stat64_add(&ram_counters.normal_pages, 1); 1396 return 1; 1397 } 1398 1399 /** 1400 * ram_save_page: send the given page to the stream 1401 * 1402 * Returns the number of pages written. 1403 * < 0 - error 1404 * >=0 - Number of pages written - this might legally be 0 1405 * if xbzrle noticed the page was the same.
1406 * 1407 * @rs: current RAM state 1408 * @block: block that contains the page we want to send 1409 * @offset: offset inside the block for the page 1410 */ 1411 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1412 { 1413 int pages = -1; 1414 uint8_t *p; 1415 bool send_async = true; 1416 RAMBlock *block = pss->block; 1417 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1418 ram_addr_t current_addr = block->offset + offset; 1419 1420 p = block->host + offset; 1421 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1422 1423 XBZRLE_cache_lock(); 1424 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1425 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1426 block, offset); 1427 if (!rs->last_stage) { 1428 /* Can't send this cached data async, since the cache page 1429 * might get updated before it gets to the wire 1430 */ 1431 send_async = false; 1432 } 1433 } 1434 1435 /* XBZRLE overflow or normal page */ 1436 if (pages == -1) { 1437 pages = save_normal_page(pss, block, offset, p, send_async); 1438 } 1439 1440 XBZRLE_cache_unlock(); 1441 1442 return pages; 1443 } 1444 1445 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, 1446 ram_addr_t offset) 1447 { 1448 if (multifd_queue_page(file, block, offset) < 0) { 1449 return -1; 1450 } 1451 stat64_add(&ram_counters.normal_pages, 1); 1452 1453 return 1; 1454 } 1455 1456 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1457 ram_addr_t offset, uint8_t *source_buf) 1458 { 1459 RAMState *rs = ram_state; 1460 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 1461 uint8_t *p = block->host + offset; 1462 int ret; 1463 1464 if (save_zero_page_to_file(pss, f, block, offset)) { 1465 return true; 1466 } 1467 1468 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1469 1470 /* 1471 * copy it to an internal buffer to avoid it being modified by the VM, 1472 * so that we can catch any error during compression and 1473 * decompression 1474 */ 1475 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1476 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1477 if (ret < 0) { 1478 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1479 error_report("compressed data failed!"); 1480 } 1481 return false; 1482 } 1483 1484 static void 1485 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1486 { 1487 ram_transferred_add(bytes_xmit); 1488 1489 if (param->zero_page) { 1490 stat64_add(&ram_counters.zero_pages, 1); 1491 return; 1492 } 1493 1494 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE.
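 * i.e. the 8-byte offset|flags word written by save_page_header(); with the CONTINUE flag set no block idstr follows, so subtracting it leaves just the compressed payload.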
*/ 1495 compression_counters.compressed_size += bytes_xmit - 8; 1496 compression_counters.pages++; 1497 } 1498 1499 static bool save_page_use_compression(RAMState *rs); 1500 1501 static void flush_compressed_data(RAMState *rs) 1502 { 1503 MigrationState *ms = migrate_get_current(); 1504 int idx, len, thread_count; 1505 1506 if (!save_page_use_compression(rs)) { 1507 return; 1508 } 1509 thread_count = migrate_compress_threads(); 1510 1511 qemu_mutex_lock(&comp_done_lock); 1512 for (idx = 0; idx < thread_count; idx++) { 1513 while (!comp_param[idx].done) { 1514 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1515 } 1516 } 1517 qemu_mutex_unlock(&comp_done_lock); 1518 1519 for (idx = 0; idx < thread_count; idx++) { 1520 qemu_mutex_lock(&comp_param[idx].mutex); 1521 if (!comp_param[idx].quit) { 1522 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); 1523 /* 1524 * it's safe to fetch zero_page without holding comp_done_lock 1525 * as there is no further request submitted to the thread, 1526 * i.e, the thread should be waiting for a request at this point. 1527 */ 1528 update_compress_thread_counts(&comp_param[idx], len); 1529 } 1530 qemu_mutex_unlock(&comp_param[idx].mutex); 1531 } 1532 } 1533 1534 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1535 ram_addr_t offset) 1536 { 1537 param->block = block; 1538 param->offset = offset; 1539 } 1540 1541 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) 1542 { 1543 int idx, thread_count, bytes_xmit = -1, pages = -1; 1544 bool wait = migrate_compress_wait_thread(); 1545 MigrationState *ms = migrate_get_current(); 1546 1547 thread_count = migrate_compress_threads(); 1548 qemu_mutex_lock(&comp_done_lock); 1549 retry: 1550 for (idx = 0; idx < thread_count; idx++) { 1551 if (comp_param[idx].done) { 1552 comp_param[idx].done = false; 1553 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, 1554 comp_param[idx].file); 1555 qemu_mutex_lock(&comp_param[idx].mutex); 1556 set_compress_params(&comp_param[idx], block, offset); 1557 qemu_cond_signal(&comp_param[idx].cond); 1558 qemu_mutex_unlock(&comp_param[idx].mutex); 1559 pages = 1; 1560 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1561 break; 1562 } 1563 } 1564 1565 /* 1566 * wait for the free thread if the user specifies 'compress-wait-thread', 1567 * otherwise we will post the page out in the main thread as normal page. 1568 */ 1569 if (pages < 0 && wait) { 1570 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1571 goto retry; 1572 } 1573 qemu_mutex_unlock(&comp_done_lock); 1574 1575 return pages; 1576 } 1577 1578 #define PAGE_ALL_CLEAN 0 1579 #define PAGE_TRY_AGAIN 1 1580 #define PAGE_DIRTY_FOUND 2 1581 /** 1582 * find_dirty_block: find the next dirty page and update any state 1583 * associated with the search process. 1584 * 1585 * Returns: 1586 * PAGE_ALL_CLEAN: no dirty page found, give up 1587 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1588 * PAGE_DIRTY_FOUND: dirty page found 1589 * 1590 * @rs: current RAM state 1591 * @pss: data about the state of the current dirty page scan 1592 * @again: set to false if the search has scanned the whole of RAM 1593 */ 1594 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1595 { 1596 /* Update pss->page for the next dirty bit in ramblock */ 1597 pss_find_next_dirty(pss); 1598 1599 if (pss->complete_round && pss->block == rs->last_seen_block && 1600 pss->page >= rs->last_page) { 1601 /* 1602 * We've been once around the RAM and haven't found anything. 
1603 * Give up. 1604 */ 1605 return PAGE_ALL_CLEAN; 1606 } 1607 if (!offset_in_ramblock(pss->block, 1608 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1609 /* Didn't find anything in this RAM Block */ 1610 pss->page = 0; 1611 pss->block = QLIST_NEXT_RCU(pss->block, next); 1612 if (!pss->block) { 1613 /* 1614 * If memory migration starts over, we will meet a dirtied page 1615 * which may still exist in a compression thread's ring, so we 1616 * should flush the compressed data to make sure the new page 1617 * is not overwritten by the old one in the destination. 1618 * 1619 * Also, if xbzrle is on, stop using the data compression at this 1620 * point. In theory, xbzrle can do better than compression. 1621 */ 1622 flush_compressed_data(rs); 1623 1624 /* Hit the end of the list */ 1625 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1626 /* Flag that we've looped */ 1627 pss->complete_round = true; 1628 /* After the first round, enable XBZRLE. */ 1629 if (migrate_xbzrle()) { 1630 rs->xbzrle_enabled = true; 1631 } 1632 } 1633 /* Didn't find anything this time, but try again on the new block */ 1634 return PAGE_TRY_AGAIN; 1635 } else { 1636 /* We've found something */ 1637 return PAGE_DIRTY_FOUND; 1638 } 1639 } 1640 1641 /** 1642 * unqueue_page: gets a page off the queue 1643 * 1644 * Helper for 'get_queued_page' - gets a page off the queue 1645 * 1646 * Returns the block of the page (or NULL if none available) 1647 * 1648 * @rs: current RAM state 1649 * @offset: used to return the offset within the RAMBlock 1650 */ 1651 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1652 { 1653 struct RAMSrcPageRequest *entry; 1654 RAMBlock *block = NULL; 1655 1656 if (!postcopy_has_request(rs)) { 1657 return NULL; 1658 } 1659 1660 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1661 1662 /* 1663 * This should _never_ change even after we take the lock, because no one 1664 * should be taking anything off the request list other than us.
1665 */ 1666 assert(postcopy_has_request(rs)); 1667 1668 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1669 block = entry->rb; 1670 *offset = entry->offset; 1671 1672 if (entry->len > TARGET_PAGE_SIZE) { 1673 entry->len -= TARGET_PAGE_SIZE; 1674 entry->offset += TARGET_PAGE_SIZE; 1675 } else { 1676 memory_region_unref(block->mr); 1677 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1678 g_free(entry); 1679 migration_consume_urgent_request(); 1680 } 1681 1682 return block; 1683 } 1684 1685 #if defined(__linux__) 1686 /** 1687 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1688 * is found, return RAM block pointer and page offset 1689 * 1690 * Returns pointer to the RAMBlock containing faulting page, 1691 * NULL if no write faults are pending 1692 * 1693 * @rs: current RAM state 1694 * @offset: page offset from the beginning of the block 1695 */ 1696 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1697 { 1698 struct uffd_msg uffd_msg; 1699 void *page_address; 1700 RAMBlock *block; 1701 int res; 1702 1703 if (!migrate_background_snapshot()) { 1704 return NULL; 1705 } 1706 1707 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1708 if (res <= 0) { 1709 return NULL; 1710 } 1711 1712 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1713 block = qemu_ram_block_from_host(page_address, false, offset); 1714 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1715 return block; 1716 } 1717 1718 /** 1719 * ram_save_release_protection: release UFFD write protection after 1720 * a range of pages has been saved 1721 * 1722 * @rs: current RAM state 1723 * @pss: page-search-status structure 1724 * @start_page: index of the first page in the range relative to pss->block 1725 * 1726 * Returns 0 on success, negative value in case of an error 1727 */ 1728 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1729 unsigned long start_page) 1730 { 1731 int res = 0; 1732 1733 /* Check if page is from UFFD-managed region. */ 1734 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1735 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1736 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1737 1738 /* Flush async buffers before un-protect. */ 1739 qemu_fflush(pss->pss_channel); 1740 /* Un-protect memory range. 
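 * Once write access is restored the guest can dirty the page again, so this must only happen after the flush above has pushed the saved contents out of the async buffers.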
*/ 1741 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1742 false, false); 1743 } 1744 1745 return res; 1746 } 1747 1748 /* ram_write_tracking_available: check if kernel supports required UFFD features 1749 * 1750 * Returns true if supports, false otherwise 1751 */ 1752 bool ram_write_tracking_available(void) 1753 { 1754 uint64_t uffd_features; 1755 int res; 1756 1757 res = uffd_query_features(&uffd_features); 1758 return (res == 0 && 1759 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1760 } 1761 1762 /* ram_write_tracking_compatible: check if guest configuration is 1763 * compatible with 'write-tracking' 1764 * 1765 * Returns true if compatible, false otherwise 1766 */ 1767 bool ram_write_tracking_compatible(void) 1768 { 1769 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1770 int uffd_fd; 1771 RAMBlock *block; 1772 bool ret = false; 1773 1774 /* Open UFFD file descriptor */ 1775 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1776 if (uffd_fd < 0) { 1777 return false; 1778 } 1779 1780 RCU_READ_LOCK_GUARD(); 1781 1782 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1783 uint64_t uffd_ioctls; 1784 1785 /* Nothing to do with read-only and MMIO-writable regions */ 1786 if (block->mr->readonly || block->mr->rom_device) { 1787 continue; 1788 } 1789 /* Try to register block memory via UFFD-IO to track writes */ 1790 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1791 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1792 goto out; 1793 } 1794 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1795 goto out; 1796 } 1797 } 1798 ret = true; 1799 1800 out: 1801 uffd_close_fd(uffd_fd); 1802 return ret; 1803 } 1804 1805 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1806 ram_addr_t size) 1807 { 1808 const ram_addr_t end = offset + size; 1809 1810 /* 1811 * We read one byte of each page; this will preallocate page tables if 1812 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1813 * where no page was populated yet. This might require adaption when 1814 * supporting other mappings, like shmem. 1815 */ 1816 for (; offset < end; offset += block->page_size) { 1817 char tmp = *((char *)block->host + offset); 1818 1819 /* Don't optimize the read out */ 1820 asm volatile("" : "+r" (tmp)); 1821 } 1822 } 1823 1824 static inline int populate_read_section(MemoryRegionSection *section, 1825 void *opaque) 1826 { 1827 const hwaddr size = int128_get64(section->size); 1828 hwaddr offset = section->offset_within_region; 1829 RAMBlock *block = section->mr->ram_block; 1830 1831 populate_read_range(block, offset, size); 1832 return 0; 1833 } 1834 1835 /* 1836 * ram_block_populate_read: preallocate page tables and populate pages in the 1837 * RAM block by reading a byte of each page. 1838 * 1839 * Since it's solely used for userfault_fd WP feature, here we just 1840 * hardcode page size to qemu_real_host_page_size. 1841 * 1842 * @block: RAM block to populate 1843 */ 1844 static void ram_block_populate_read(RAMBlock *rb) 1845 { 1846 /* 1847 * Skip populating all pages that fall into a discarded range as managed by 1848 * a RamDiscardManager responsible for the mapped memory region of the 1849 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1850 * must not get populated automatically. 
We don't have to track 1851 * modifications via userfaultfd WP reliably, because these pages will 1852 * not be part of the migration stream either way -- see 1853 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1854 * 1855 * Note: The result is only stable while migrating (precopy/postcopy). 1856 */ 1857 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1858 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1859 MemoryRegionSection section = { 1860 .mr = rb->mr, 1861 .offset_within_region = 0, 1862 .size = rb->mr->size, 1863 }; 1864 1865 ram_discard_manager_replay_populated(rdm, &section, 1866 populate_read_section, NULL); 1867 } else { 1868 populate_read_range(rb, 0, rb->used_length); 1869 } 1870 } 1871 1872 /* 1873 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1874 */ 1875 void ram_write_tracking_prepare(void) 1876 { 1877 RAMBlock *block; 1878 1879 RCU_READ_LOCK_GUARD(); 1880 1881 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1882 /* Nothing to do with read-only and MMIO-writable regions */ 1883 if (block->mr->readonly || block->mr->rom_device) { 1884 continue; 1885 } 1886 1887 /* 1888 * Populate pages of the RAM block before enabling userfault_fd 1889 * write protection. 1890 * 1891 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1892 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1893 * pages with pte_none() entries in page table. 1894 */ 1895 ram_block_populate_read(block); 1896 } 1897 } 1898 1899 static inline int uffd_protect_section(MemoryRegionSection *section, 1900 void *opaque) 1901 { 1902 const hwaddr size = int128_get64(section->size); 1903 const hwaddr offset = section->offset_within_region; 1904 RAMBlock *rb = section->mr->ram_block; 1905 int uffd_fd = (uintptr_t)opaque; 1906 1907 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1908 false); 1909 } 1910 1911 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1912 { 1913 assert(rb->flags & RAM_UF_WRITEPROTECT); 1914 1915 /* See ram_block_populate_read() */ 1916 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1917 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1918 MemoryRegionSection section = { 1919 .mr = rb->mr, 1920 .offset_within_region = 0, 1921 .size = rb->mr->size, 1922 }; 1923 1924 return ram_discard_manager_replay_populated(rdm, &section, 1925 uffd_protect_section, 1926 (void *)(uintptr_t)uffd_fd); 1927 } 1928 return uffd_change_protection(uffd_fd, rb->host, 1929 rb->used_length, true, false); 1930 } 1931 1932 /* 1933 * ram_write_tracking_start: start UFFD-WP memory tracking 1934 * 1935 * Returns 0 for success or negative value in case of error 1936 */ 1937 int ram_write_tracking_start(void) 1938 { 1939 int uffd_fd; 1940 RAMState *rs = ram_state; 1941 RAMBlock *block; 1942 1943 /* Open UFFD file descriptor */ 1944 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1945 if (uffd_fd < 0) { 1946 return uffd_fd; 1947 } 1948 rs->uffdio_fd = uffd_fd; 1949 1950 RCU_READ_LOCK_GUARD(); 1951 1952 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1953 /* Nothing to do with read-only and MMIO-writable regions */ 1954 if (block->mr->readonly || block->mr->rom_device) { 1955 continue; 1956 } 1957 1958 /* Register block memory with UFFD to track writes */ 1959 if (uffd_register_memory(rs->uffdio_fd, block->host, 1960 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1961 goto fail; 1962 } 1963 block->flags |= RAM_UF_WRITEPROTECT; 1964 memory_region_ref(block->mr); 1965 1966 /* Apply
UFFD write protection to the block memory range */ 1967 if (ram_block_uffd_protect(block, uffd_fd)) { 1968 goto fail; 1969 } 1970 1971 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1972 block->host, block->max_length); 1973 } 1974 1975 return 0; 1976 1977 fail: 1978 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1979 1980 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1981 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1982 continue; 1983 } 1984 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1985 /* Cleanup flags and remove reference */ 1986 block->flags &= ~RAM_UF_WRITEPROTECT; 1987 memory_region_unref(block->mr); 1988 } 1989 1990 uffd_close_fd(uffd_fd); 1991 rs->uffdio_fd = -1; 1992 return -1; 1993 } 1994 1995 /** 1996 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1997 */ 1998 void ram_write_tracking_stop(void) 1999 { 2000 RAMState *rs = ram_state; 2001 RAMBlock *block; 2002 2003 RCU_READ_LOCK_GUARD(); 2004 2005 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2006 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 2007 continue; 2008 } 2009 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 2010 2011 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 2012 block->host, block->max_length); 2013 2014 /* Cleanup flags and remove reference */ 2015 block->flags &= ~RAM_UF_WRITEPROTECT; 2016 memory_region_unref(block->mr); 2017 } 2018 2019 /* Finally close UFFD file descriptor */ 2020 uffd_close_fd(rs->uffdio_fd); 2021 rs->uffdio_fd = -1; 2022 } 2023 2024 #else 2025 /* No target OS support, stubs just fail or ignore */ 2026 2027 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 2028 { 2029 (void) rs; 2030 (void) offset; 2031 2032 return NULL; 2033 } 2034 2035 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2036 unsigned long start_page) 2037 { 2038 (void) rs; 2039 (void) pss; 2040 (void) start_page; 2041 2042 return 0; 2043 } 2044 2045 bool ram_write_tracking_available(void) 2046 { 2047 return false; 2048 } 2049 2050 bool ram_write_tracking_compatible(void) 2051 { 2052 assert(0); 2053 return false; 2054 } 2055 2056 int ram_write_tracking_start(void) 2057 { 2058 assert(0); 2059 return -1; 2060 } 2061 2062 void ram_write_tracking_stop(void) 2063 { 2064 assert(0); 2065 } 2066 #endif /* defined(__linux__) */ 2067 2068 /** 2069 * get_queued_page: unqueue a page from the postcopy requests 2070 * 2071 * Skips pages that are already sent (!dirty) 2072 * 2073 * Returns true if a queued page is found 2074 * 2075 * @rs: current RAM state 2076 * @pss: data about the state of the current dirty page scan 2077 */ 2078 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2079 { 2080 RAMBlock *block; 2081 ram_addr_t offset; 2082 bool dirty; 2083 2084 do { 2085 block = unqueue_page(rs, &offset); 2086 /* 2087 * We're sending this page, and since it's postcopy nothing else 2088 * will dirty it, and we must make sure it doesn't get sent again 2089 * even if this queue request was received after the background 2090 * search already sent it. 
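 * The dirty bitmap is the authority for that: if the bit is already clear
 * the page was sent (or never needed sending) and the request is simply
 * dropped.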
2091 */ 2092 if (block) { 2093 unsigned long page; 2094 2095 page = offset >> TARGET_PAGE_BITS; 2096 dirty = test_bit(page, block->bmap); 2097 if (!dirty) { 2098 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2099 page); 2100 } else { 2101 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2102 } 2103 } 2104 2105 } while (block && !dirty); 2106 2107 if (!block) { 2108 /* 2109 * Poll write faults too if background snapshot is enabled; that's 2110 * when we have vcpus got blocked by the write protected pages. 2111 */ 2112 block = poll_fault_page(rs, &offset); 2113 } 2114 2115 if (block) { 2116 /* 2117 * We want the background search to continue from the queued page 2118 * since the guest is likely to want other pages near to the page 2119 * it just requested. 2120 */ 2121 pss->block = block; 2122 pss->page = offset >> TARGET_PAGE_BITS; 2123 2124 /* 2125 * This unqueued page would break the "one round" check, even is 2126 * really rare. 2127 */ 2128 pss->complete_round = false; 2129 } 2130 2131 return !!block; 2132 } 2133 2134 /** 2135 * migration_page_queue_free: drop any remaining pages in the ram 2136 * request queue 2137 * 2138 * It should be empty at the end anyway, but in error cases there may 2139 * be some left. in case that there is any page left, we drop it. 2140 * 2141 */ 2142 static void migration_page_queue_free(RAMState *rs) 2143 { 2144 struct RAMSrcPageRequest *mspr, *next_mspr; 2145 /* This queue generally should be empty - but in the case of a failed 2146 * migration might have some droppings in. 2147 */ 2148 RCU_READ_LOCK_GUARD(); 2149 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2150 memory_region_unref(mspr->rb->mr); 2151 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2152 g_free(mspr); 2153 } 2154 } 2155 2156 /** 2157 * ram_save_queue_pages: queue the page for transmission 2158 * 2159 * A request from postcopy destination for example. 2160 * 2161 * Returns zero on success or negative on error 2162 * 2163 * @rbname: Name of the RAMBLock of the request. NULL means the 2164 * same that last one. 2165 * @start: starting address from the start of the RAMBlock 2166 * @len: length (in bytes) to send 2167 */ 2168 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2169 { 2170 RAMBlock *ramblock; 2171 RAMState *rs = ram_state; 2172 2173 stat64_add(&ram_counters.postcopy_requests, 1); 2174 RCU_READ_LOCK_GUARD(); 2175 2176 if (!rbname) { 2177 /* Reuse last RAMBlock */ 2178 ramblock = rs->last_req_rb; 2179 2180 if (!ramblock) { 2181 /* 2182 * Shouldn't happen, we can't reuse the last RAMBlock if 2183 * it's the 1st request. 2184 */ 2185 error_report("ram_save_queue_pages no previous block"); 2186 return -1; 2187 } 2188 } else { 2189 ramblock = qemu_ram_block_by_name(rbname); 2190 2191 if (!ramblock) { 2192 /* We shouldn't be asked for a non-existent RAMBlock */ 2193 error_report("ram_save_queue_pages no block '%s'", rbname); 2194 return -1; 2195 } 2196 rs->last_req_rb = ramblock; 2197 } 2198 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2199 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2200 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2201 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2202 __func__, start, len, ramblock->used_length); 2203 return -1; 2204 } 2205 2206 /* 2207 * When with postcopy preempt, we send back the page directly in the 2208 * rp-return thread. 
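 * Instead of queueing the request for the migration thread, the host page
 * is sent right away on the dedicated preempt channel.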
2209 */ 2210 if (postcopy_preempt_active()) { 2211 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2212 size_t page_size = qemu_ram_pagesize(ramblock); 2213 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2214 int ret = 0; 2215 2216 qemu_mutex_lock(&rs->bitmap_mutex); 2217 2218 pss_init(pss, ramblock, page_start); 2219 /* 2220 * Always use the preempt channel, and make sure it's there. It's 2221 * safe to access without lock, because when rp-thread is running 2222 * we should be the only one who operates on the qemufile 2223 */ 2224 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2225 assert(pss->pss_channel); 2226 2227 /* 2228 * It must be either one or multiple of host page size. Just 2229 * assert; if something wrong we're mostly split brain anyway. 2230 */ 2231 assert(len % page_size == 0); 2232 while (len) { 2233 if (ram_save_host_page_urgent(pss)) { 2234 error_report("%s: ram_save_host_page_urgent() failed: " 2235 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2236 __func__, ramblock->idstr, start); 2237 ret = -1; 2238 break; 2239 } 2240 /* 2241 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2242 * will automatically be moved and point to the next host page 2243 * we're going to send, so no need to update here. 2244 * 2245 * Normally QEMU never sends >1 host page in requests, so 2246 * logically we don't even need that as the loop should only 2247 * run once, but just to be consistent. 2248 */ 2249 len -= page_size; 2250 }; 2251 qemu_mutex_unlock(&rs->bitmap_mutex); 2252 2253 return ret; 2254 } 2255 2256 struct RAMSrcPageRequest *new_entry = 2257 g_new0(struct RAMSrcPageRequest, 1); 2258 new_entry->rb = ramblock; 2259 new_entry->offset = start; 2260 new_entry->len = len; 2261 2262 memory_region_ref(ramblock->mr); 2263 qemu_mutex_lock(&rs->src_page_req_mutex); 2264 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2265 migration_make_urgent_request(); 2266 qemu_mutex_unlock(&rs->src_page_req_mutex); 2267 2268 return 0; 2269 } 2270 2271 static bool save_page_use_compression(RAMState *rs) 2272 { 2273 if (!migrate_compress()) { 2274 return false; 2275 } 2276 2277 /* 2278 * If xbzrle is enabled (e.g., after first round of migration), stop 2279 * using the data compression. In theory, xbzrle can do better than 2280 * compression. 2281 */ 2282 if (rs->xbzrle_enabled) { 2283 return false; 2284 } 2285 2286 return true; 2287 } 2288 2289 /* 2290 * try to compress the page before posting it out, return true if the page 2291 * has been properly handled by compression, otherwise needs other 2292 * paths to handle it 2293 */ 2294 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2295 RAMBlock *block, ram_addr_t offset) 2296 { 2297 if (!save_page_use_compression(rs)) { 2298 return false; 2299 } 2300 2301 /* 2302 * When starting the process of a new block, the first page of 2303 * the block should be sent out before other pages in the same 2304 * block, and all the pages in last block should have been sent 2305 * out, keeping this order is important, because the 'cont' flag 2306 * is used to avoid resending the block name. 2307 * 2308 * We post the fist page as normal page as compression will take 2309 * much CPU resource. 
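 * Returning false below routes this first page through the regular path,
 * which also emits the new block header.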
2310 */ 2311 if (block != pss->last_sent_block) { 2312 flush_compressed_data(rs); 2313 return false; 2314 } 2315 2316 if (compress_page_with_multi_thread(block, offset) > 0) { 2317 return true; 2318 } 2319 2320 compression_counters.busy++; 2321 return false; 2322 } 2323 2324 /** 2325 * ram_save_target_page_legacy: save one target page 2326 * 2327 * Returns the number of pages written 2328 * 2329 * @rs: current RAM state 2330 * @pss: data about the page we want to send 2331 */ 2332 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 2333 { 2334 RAMBlock *block = pss->block; 2335 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2336 int res; 2337 2338 if (control_save_page(pss, block, offset, &res)) { 2339 return res; 2340 } 2341 2342 if (save_compress_page(rs, pss, block, offset)) { 2343 return 1; 2344 } 2345 2346 res = save_zero_page(pss, pss->pss_channel, block, offset); 2347 if (res > 0) { 2348 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2349 * page would be stale 2350 */ 2351 if (rs->xbzrle_enabled) { 2352 XBZRLE_cache_lock(); 2353 xbzrle_cache_zero_page(rs, block->offset + offset); 2354 XBZRLE_cache_unlock(); 2355 } 2356 return res; 2357 } 2358 2359 /* 2360 * Do not use multifd in postcopy as one whole host page should be 2361 * placed. Meanwhile postcopy requires atomic update of pages, so even 2362 * if host page size == guest page size the dest guest during run may 2363 * still see partially copied pages which is data corruption. 2364 */ 2365 if (migrate_multifd() && !migration_in_postcopy()) { 2366 return ram_save_multifd_page(pss->pss_channel, block, offset); 2367 } 2368 2369 return ram_save_page(rs, pss); 2370 } 2371 2372 /* Should be called before sending a host page */ 2373 static void pss_host_page_prepare(PageSearchStatus *pss) 2374 { 2375 /* How many guest pages are there in one host page? */ 2376 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2377 2378 pss->host_page_sending = true; 2379 if (guest_pfns <= 1) { 2380 /* 2381 * This covers both when guest psize == host psize, or when guest 2382 * has larger psize than the host (guest_pfns==0). 2383 * 2384 * For the latter, we always send one whole guest page per 2385 * iteration of the host page (example: an Alpha VM on x86 host 2386 * will have guest psize 8K while host psize 4K). 2387 */ 2388 pss->host_page_start = pss->page; 2389 pss->host_page_end = pss->page + 1; 2390 } else { 2391 /* 2392 * The host page spans over multiple guest pages, we send them 2393 * within the same host page iteration. 2394 */ 2395 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2396 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2397 } 2398 } 2399 2400 /* 2401 * Whether the page pointed by PSS is within the host page being sent. 2402 * Must be called after a previous pss_host_page_prepare(). 2403 */ 2404 static bool pss_within_range(PageSearchStatus *pss) 2405 { 2406 ram_addr_t ram_addr; 2407 2408 assert(pss->host_page_sending); 2409 2410 /* Over host-page boundary? 
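 * host_page_end was set up in pss_host_page_prepare(); reaching it means
 * the current host page has been fully scanned.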
*/ 2411 if (pss->page >= pss->host_page_end) { 2412 return false; 2413 } 2414 2415 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2416 2417 return offset_in_ramblock(pss->block, ram_addr); 2418 } 2419 2420 static void pss_host_page_finish(PageSearchStatus *pss) 2421 { 2422 pss->host_page_sending = false; 2423 /* This is not needed, but just to reset it */ 2424 pss->host_page_start = pss->host_page_end = 0; 2425 } 2426 2427 /* 2428 * Send an urgent host page specified by `pss'. Need to be called with 2429 * bitmap_mutex held. 2430 * 2431 * Returns 0 if save host page succeeded, false otherwise. 2432 */ 2433 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2434 { 2435 bool page_dirty, sent = false; 2436 RAMState *rs = ram_state; 2437 int ret = 0; 2438 2439 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2440 pss_host_page_prepare(pss); 2441 2442 /* 2443 * If precopy is sending the same page, let it be done in precopy, or 2444 * we could send the same page in two channels and none of them will 2445 * receive the whole page. 2446 */ 2447 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2448 trace_postcopy_preempt_hit(pss->block->idstr, 2449 pss->page << TARGET_PAGE_BITS); 2450 return 0; 2451 } 2452 2453 do { 2454 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2455 2456 if (page_dirty) { 2457 /* Be strict to return code; it must be 1, or what else? */ 2458 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2459 error_report_once("%s: ram_save_target_page failed", __func__); 2460 ret = -1; 2461 goto out; 2462 } 2463 sent = true; 2464 } 2465 pss_find_next_dirty(pss); 2466 } while (pss_within_range(pss)); 2467 out: 2468 pss_host_page_finish(pss); 2469 /* For urgent requests, flush immediately if sent */ 2470 if (sent) { 2471 qemu_fflush(pss->pss_channel); 2472 } 2473 return ret; 2474 } 2475 2476 /** 2477 * ram_save_host_page: save a whole host page 2478 * 2479 * Starting at *offset send pages up to the end of the current host 2480 * page. It's valid for the initial offset to point into the middle of 2481 * a host page in which case the remainder of the hostpage is sent. 2482 * Only dirty target pages are sent. Note that the host page size may 2483 * be a huge page for this block. 2484 * 2485 * The saving stops at the boundary of the used_length of the block 2486 * if the RAMBlock isn't a multiple of the host page size. 2487 * 2488 * The caller must be with ram_state.bitmap_mutex held to call this 2489 * function. Note that this function can temporarily release the lock, but 2490 * when the function is returned it'll make sure the lock is still held. 
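 * (The lock is only dropped around the actual page send, and only when
 * postcopy preempt is active; see the preempt_active handling below.)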
2491 * 2492 * Returns the number of pages written or negative on error 2493 * 2494 * @rs: current RAM state 2495 * @pss: data about the page we want to send 2496 */ 2497 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2498 { 2499 bool page_dirty, preempt_active = postcopy_preempt_active(); 2500 int tmppages, pages = 0; 2501 size_t pagesize_bits = 2502 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2503 unsigned long start_page = pss->page; 2504 int res; 2505 2506 if (ramblock_is_ignored(pss->block)) { 2507 error_report("block %s should not be migrated !", pss->block->idstr); 2508 return 0; 2509 } 2510 2511 /* Update host page boundary information */ 2512 pss_host_page_prepare(pss); 2513 2514 do { 2515 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2516 2517 /* Check the pages is dirty and if it is send it */ 2518 if (page_dirty) { 2519 /* 2520 * Properly yield the lock only in postcopy preempt mode 2521 * because both migration thread and rp-return thread can 2522 * operate on the bitmaps. 2523 */ 2524 if (preempt_active) { 2525 qemu_mutex_unlock(&rs->bitmap_mutex); 2526 } 2527 tmppages = migration_ops->ram_save_target_page(rs, pss); 2528 if (tmppages >= 0) { 2529 pages += tmppages; 2530 /* 2531 * Allow rate limiting to happen in the middle of huge pages if 2532 * something is sent in the current iteration. 2533 */ 2534 if (pagesize_bits > 1 && tmppages > 0) { 2535 migration_rate_limit(); 2536 } 2537 } 2538 if (preempt_active) { 2539 qemu_mutex_lock(&rs->bitmap_mutex); 2540 } 2541 } else { 2542 tmppages = 0; 2543 } 2544 2545 if (tmppages < 0) { 2546 pss_host_page_finish(pss); 2547 return tmppages; 2548 } 2549 2550 pss_find_next_dirty(pss); 2551 } while (pss_within_range(pss)); 2552 2553 pss_host_page_finish(pss); 2554 2555 res = ram_save_release_protection(rs, pss, start_page); 2556 return (res < 0 ? res : pages); 2557 } 2558 2559 /** 2560 * ram_find_and_save_block: finds a dirty page and sends it to f 2561 * 2562 * Called within an RCU critical section. 2563 * 2564 * Returns the number of pages written where zero means no dirty pages, 2565 * or negative on error 2566 * 2567 * @rs: current RAM state 2568 * 2569 * On systems where host-page-size > target-page-size it will send all the 2570 * pages in a host page that are dirty. 2571 */ 2572 static int ram_find_and_save_block(RAMState *rs) 2573 { 2574 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2575 int pages = 0; 2576 2577 /* No dirty page as there is zero RAM */ 2578 if (!rs->ram_bytes_total) { 2579 return pages; 2580 } 2581 2582 /* 2583 * Always keep last_seen_block/last_page valid during this procedure, 2584 * because find_dirty_block() relies on these values (e.g., we compare 2585 * last_seen_block with pss.block to see whether we searched all the 2586 * ramblocks) to detect the completion of migration. Having NULL value 2587 * of last_seen_block can conditionally cause below loop to run forever. 
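 * Hence the fallback below: if nothing has been seen yet, start from the
 * first block of ram_list at page 0.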
2588 */ 2589 if (!rs->last_seen_block) { 2590 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2591 rs->last_page = 0; 2592 } 2593 2594 pss_init(pss, rs->last_seen_block, rs->last_page); 2595 2596 while (true){ 2597 if (!get_queued_page(rs, pss)) { 2598 /* priority queue empty, so just search for something dirty */ 2599 int res = find_dirty_block(rs, pss); 2600 if (res != PAGE_DIRTY_FOUND) { 2601 if (res == PAGE_ALL_CLEAN) { 2602 break; 2603 } else if (res == PAGE_TRY_AGAIN) { 2604 continue; 2605 } 2606 } 2607 } 2608 pages = ram_save_host_page(rs, pss); 2609 if (pages) { 2610 break; 2611 } 2612 } 2613 2614 rs->last_seen_block = pss->block; 2615 rs->last_page = pss->page; 2616 2617 return pages; 2618 } 2619 2620 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2621 { 2622 uint64_t pages = size / TARGET_PAGE_SIZE; 2623 2624 if (zero) { 2625 stat64_add(&ram_counters.zero_pages, pages); 2626 } else { 2627 stat64_add(&ram_counters.normal_pages, pages); 2628 ram_transferred_add(size); 2629 qemu_file_credit_transfer(f, size); 2630 } 2631 } 2632 2633 static uint64_t ram_bytes_total_with_ignored(void) 2634 { 2635 RAMBlock *block; 2636 uint64_t total = 0; 2637 2638 RCU_READ_LOCK_GUARD(); 2639 2640 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2641 total += block->used_length; 2642 } 2643 return total; 2644 } 2645 2646 uint64_t ram_bytes_total(void) 2647 { 2648 RAMBlock *block; 2649 uint64_t total = 0; 2650 2651 RCU_READ_LOCK_GUARD(); 2652 2653 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2654 total += block->used_length; 2655 } 2656 return total; 2657 } 2658 2659 static void xbzrle_load_setup(void) 2660 { 2661 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2662 } 2663 2664 static void xbzrle_load_cleanup(void) 2665 { 2666 g_free(XBZRLE.decoded_buf); 2667 XBZRLE.decoded_buf = NULL; 2668 } 2669 2670 static void ram_state_cleanup(RAMState **rsp) 2671 { 2672 if (*rsp) { 2673 migration_page_queue_free(*rsp); 2674 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2675 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2676 g_free(*rsp); 2677 *rsp = NULL; 2678 } 2679 } 2680 2681 static void xbzrle_cleanup(void) 2682 { 2683 XBZRLE_cache_lock(); 2684 if (XBZRLE.cache) { 2685 cache_fini(XBZRLE.cache); 2686 g_free(XBZRLE.encoded_buf); 2687 g_free(XBZRLE.current_buf); 2688 g_free(XBZRLE.zero_target_page); 2689 XBZRLE.cache = NULL; 2690 XBZRLE.encoded_buf = NULL; 2691 XBZRLE.current_buf = NULL; 2692 XBZRLE.zero_target_page = NULL; 2693 } 2694 XBZRLE_cache_unlock(); 2695 } 2696 2697 static void ram_save_cleanup(void *opaque) 2698 { 2699 RAMState **rsp = opaque; 2700 RAMBlock *block; 2701 2702 /* We don't use dirty log with background snapshots */ 2703 if (!migrate_background_snapshot()) { 2704 /* caller have hold iothread lock or is in a bh, so there is 2705 * no writing race against the migration bitmap 2706 */ 2707 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2708 /* 2709 * do not stop dirty log without starting it, since 2710 * memory_global_dirty_log_stop will assert that 2711 * memory_global_dirty_log_start/stop used in pairs 2712 */ 2713 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2714 } 2715 } 2716 2717 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2718 g_free(block->clear_bmap); 2719 block->clear_bmap = NULL; 2720 g_free(block->bmap); 2721 block->bmap = NULL; 2722 } 2723 2724 xbzrle_cleanup(); 2725 compress_threads_save_cleanup(); 2726 ram_state_cleanup(rsp); 2727 g_free(migration_ops); 2728 migration_ops = NULL; 2729 } 2730 2731 static void ram_state_reset(RAMState *rs) 2732 { 2733 int i; 2734 
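    /* Reset the per-channel search state as well as the global scan position. */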
2735 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2736 rs->pss[i].last_sent_block = NULL; 2737 } 2738 2739 rs->last_seen_block = NULL; 2740 rs->last_page = 0; 2741 rs->last_version = ram_list.version; 2742 rs->xbzrle_enabled = false; 2743 } 2744 2745 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2746 2747 /* **** functions for postcopy ***** */ 2748 2749 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2750 { 2751 struct RAMBlock *block; 2752 2753 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2754 unsigned long *bitmap = block->bmap; 2755 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2756 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2757 2758 while (run_start < range) { 2759 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2760 ram_discard_range(block->idstr, 2761 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2762 ((ram_addr_t)(run_end - run_start)) 2763 << TARGET_PAGE_BITS); 2764 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2765 } 2766 } 2767 } 2768 2769 /** 2770 * postcopy_send_discard_bm_ram: discard a RAMBlock 2771 * 2772 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2773 * 2774 * @ms: current migration state 2775 * @block: RAMBlock to discard 2776 */ 2777 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2778 { 2779 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2780 unsigned long current; 2781 unsigned long *bitmap = block->bmap; 2782 2783 for (current = 0; current < end; ) { 2784 unsigned long one = find_next_bit(bitmap, end, current); 2785 unsigned long zero, discard_length; 2786 2787 if (one >= end) { 2788 break; 2789 } 2790 2791 zero = find_next_zero_bit(bitmap, end, one + 1); 2792 2793 if (zero >= end) { 2794 discard_length = end - one; 2795 } else { 2796 discard_length = zero - one; 2797 } 2798 postcopy_discard_send_range(ms, one, discard_length); 2799 current = one + discard_length; 2800 } 2801 } 2802 2803 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2804 2805 /** 2806 * postcopy_each_ram_send_discard: discard all RAMBlocks 2807 * 2808 * Utility for the outgoing postcopy code. 2809 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2810 * passing it bitmap indexes and name. 2811 * (qemu_ram_foreach_block ends up passing unscaled lengths 2812 * which would mean postcopy code would have to deal with target page) 2813 * 2814 * @ms: current migration state 2815 */ 2816 static void postcopy_each_ram_send_discard(MigrationState *ms) 2817 { 2818 struct RAMBlock *block; 2819 2820 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2821 postcopy_discard_send_init(ms, block->idstr); 2822 2823 /* 2824 * Deal with TPS != HPS and huge pages. It discard any partially sent 2825 * host-page size chunks, mark any partially dirty host-page size 2826 * chunks as all dirty. In this case the host-page is the host-page 2827 * for the particular RAMBlock, i.e. it might be a huge page. 2828 */ 2829 postcopy_chunk_hostpages_pass(ms, block); 2830 2831 /* 2832 * Postcopy sends chunks of bitmap over the wire, but it 2833 * just needs indexes at this point, avoids it having 2834 * target page specific code. 
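 * (postcopy_send_discard_bm_ram() walks the bitmap and emits one discard
 * range per run of set bits.)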
2835 */ 2836 postcopy_send_discard_bm_ram(ms, block); 2837 postcopy_discard_send_finish(ms); 2838 } 2839 } 2840 2841 /** 2842 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2843 * 2844 * Helper for postcopy_chunk_hostpages; it's called twice to 2845 * canonicalize the two bitmaps, that are similar, but one is 2846 * inverted. 2847 * 2848 * Postcopy requires that all target pages in a hostpage are dirty or 2849 * clean, not a mix. This function canonicalizes the bitmaps. 2850 * 2851 * @ms: current migration state 2852 * @block: block that contains the page we want to canonicalize 2853 */ 2854 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2855 { 2856 RAMState *rs = ram_state; 2857 unsigned long *bitmap = block->bmap; 2858 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2859 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2860 unsigned long run_start; 2861 2862 if (block->page_size == TARGET_PAGE_SIZE) { 2863 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2864 return; 2865 } 2866 2867 /* Find a dirty page */ 2868 run_start = find_next_bit(bitmap, pages, 0); 2869 2870 while (run_start < pages) { 2871 2872 /* 2873 * If the start of this run of pages is in the middle of a host 2874 * page, then we need to fixup this host page. 2875 */ 2876 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2877 /* Find the end of this run */ 2878 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2879 /* 2880 * If the end isn't at the start of a host page, then the 2881 * run doesn't finish at the end of a host page 2882 * and we need to discard. 2883 */ 2884 } 2885 2886 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2887 unsigned long page; 2888 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2889 host_ratio); 2890 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2891 2892 /* Clean up the bitmap */ 2893 for (page = fixup_start_addr; 2894 page < fixup_start_addr + host_ratio; page++) { 2895 /* 2896 * Remark them as dirty, updating the count for any pages 2897 * that weren't previously dirty. 
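 * test_and_set_bit() returns the previous bit value, so only bits that
 * actually flip from clear to set bump migration_dirty_pages.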
2898 */ 2899 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2900 } 2901 } 2902 2903 /* Find the next dirty page for the next iteration */ 2904 run_start = find_next_bit(bitmap, pages, run_start); 2905 } 2906 } 2907 2908 /** 2909 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2910 * 2911 * Transmit the set of pages to be discarded after precopy to the target 2912 * these are pages that: 2913 * a) Have been previously transmitted but are now dirty again 2914 * b) Pages that have never been transmitted, this ensures that 2915 * any pages on the destination that have been mapped by background 2916 * tasks get discarded (transparent huge pages is the specific concern) 2917 * Hopefully this is pretty sparse 2918 * 2919 * @ms: current migration state 2920 */ 2921 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2922 { 2923 RAMState *rs = ram_state; 2924 2925 RCU_READ_LOCK_GUARD(); 2926 2927 /* This should be our last sync, the src is now paused */ 2928 migration_bitmap_sync(rs); 2929 2930 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2931 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2932 rs->last_seen_block = NULL; 2933 rs->last_page = 0; 2934 2935 postcopy_each_ram_send_discard(ms); 2936 2937 trace_ram_postcopy_send_discard_bitmap(); 2938 } 2939 2940 /** 2941 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2942 * 2943 * Returns zero on success 2944 * 2945 * @rbname: name of the RAMBlock of the request. NULL means the 2946 * same that last one. 2947 * @start: RAMBlock starting page 2948 * @length: RAMBlock size 2949 */ 2950 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2951 { 2952 trace_ram_discard_range(rbname, start, length); 2953 2954 RCU_READ_LOCK_GUARD(); 2955 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2956 2957 if (!rb) { 2958 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2959 return -1; 2960 } 2961 2962 /* 2963 * On source VM, we don't need to update the received bitmap since 2964 * we don't even have one. 2965 */ 2966 if (rb->receivedmap) { 2967 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2968 length >> qemu_target_page_bits()); 2969 } 2970 2971 return ram_block_discard_range(rb, start, length); 2972 } 2973 2974 /* 2975 * For every allocation, we will try not to crash the VM if the 2976 * allocation failed. 
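 * That is why the g_try_*() allocators are used below: on failure we unwind
 * whatever was already allocated and return -ENOMEM instead of aborting.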
2977 */ 2978 static int xbzrle_init(void) 2979 { 2980 Error *local_err = NULL; 2981 2982 if (!migrate_xbzrle()) { 2983 return 0; 2984 } 2985 2986 XBZRLE_cache_lock(); 2987 2988 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2989 if (!XBZRLE.zero_target_page) { 2990 error_report("%s: Error allocating zero page", __func__); 2991 goto err_out; 2992 } 2993 2994 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2995 TARGET_PAGE_SIZE, &local_err); 2996 if (!XBZRLE.cache) { 2997 error_report_err(local_err); 2998 goto free_zero_page; 2999 } 3000 3001 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 3002 if (!XBZRLE.encoded_buf) { 3003 error_report("%s: Error allocating encoded_buf", __func__); 3004 goto free_cache; 3005 } 3006 3007 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 3008 if (!XBZRLE.current_buf) { 3009 error_report("%s: Error allocating current_buf", __func__); 3010 goto free_encoded_buf; 3011 } 3012 3013 /* We are all good */ 3014 XBZRLE_cache_unlock(); 3015 return 0; 3016 3017 free_encoded_buf: 3018 g_free(XBZRLE.encoded_buf); 3019 XBZRLE.encoded_buf = NULL; 3020 free_cache: 3021 cache_fini(XBZRLE.cache); 3022 XBZRLE.cache = NULL; 3023 free_zero_page: 3024 g_free(XBZRLE.zero_target_page); 3025 XBZRLE.zero_target_page = NULL; 3026 err_out: 3027 XBZRLE_cache_unlock(); 3028 return -ENOMEM; 3029 } 3030 3031 static int ram_state_init(RAMState **rsp) 3032 { 3033 *rsp = g_try_new0(RAMState, 1); 3034 3035 if (!*rsp) { 3036 error_report("%s: Init ramstate fail", __func__); 3037 return -1; 3038 } 3039 3040 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3041 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3042 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3043 (*rsp)->ram_bytes_total = ram_bytes_total(); 3044 3045 /* 3046 * Count the total number of pages used by ram blocks not including any 3047 * gaps due to alignment or unplugs. 3048 * This must match with the initial values of dirty bitmap. 3049 */ 3050 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 3051 ram_state_reset(*rsp); 3052 3053 return 0; 3054 } 3055 3056 static void ram_list_init_bitmaps(void) 3057 { 3058 MigrationState *ms = migrate_get_current(); 3059 RAMBlock *block; 3060 unsigned long pages; 3061 uint8_t shift; 3062 3063 /* Skip setting bitmap if there is no RAM */ 3064 if (ram_bytes_total()) { 3065 shift = ms->clear_bitmap_shift; 3066 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3067 error_report("clear_bitmap_shift (%u) too big, using " 3068 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3069 shift = CLEAR_BITMAP_SHIFT_MAX; 3070 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3071 error_report("clear_bitmap_shift (%u) too small, using " 3072 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3073 shift = CLEAR_BITMAP_SHIFT_MIN; 3074 } 3075 3076 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3077 pages = block->max_length >> TARGET_PAGE_BITS; 3078 /* 3079 * The initial dirty bitmap for migration must be set with all 3080 * ones to make sure we'll migrate every guest RAM page to 3081 * destination. 3082 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3083 * new migration after a failed migration, ram_list. 3084 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3085 * guest memory. 
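 * (Discarded pages are removed from this all-ones bitmap later, in
 * migration_bitmap_clear_discarded_pages().)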
3086 */ 3087 block->bmap = bitmap_new(pages); 3088 bitmap_set(block->bmap, 0, pages); 3089 block->clear_bmap_shift = shift; 3090 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3091 } 3092 } 3093 } 3094 3095 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3096 { 3097 unsigned long pages; 3098 RAMBlock *rb; 3099 3100 RCU_READ_LOCK_GUARD(); 3101 3102 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3103 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3104 rs->migration_dirty_pages -= pages; 3105 } 3106 } 3107 3108 static void ram_init_bitmaps(RAMState *rs) 3109 { 3110 /* For memory_global_dirty_log_start below. */ 3111 qemu_mutex_lock_iothread(); 3112 qemu_mutex_lock_ramlist(); 3113 3114 WITH_RCU_READ_LOCK_GUARD() { 3115 ram_list_init_bitmaps(); 3116 /* We don't use dirty log with background snapshots */ 3117 if (!migrate_background_snapshot()) { 3118 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3119 migration_bitmap_sync_precopy(rs); 3120 } 3121 } 3122 qemu_mutex_unlock_ramlist(); 3123 qemu_mutex_unlock_iothread(); 3124 3125 /* 3126 * After an eventual first bitmap sync, fixup the initial bitmap 3127 * containing all 1s to exclude any discarded pages from migration. 3128 */ 3129 migration_bitmap_clear_discarded_pages(rs); 3130 } 3131 3132 static int ram_init_all(RAMState **rsp) 3133 { 3134 if (ram_state_init(rsp)) { 3135 return -1; 3136 } 3137 3138 if (xbzrle_init()) { 3139 ram_state_cleanup(rsp); 3140 return -1; 3141 } 3142 3143 ram_init_bitmaps(*rsp); 3144 3145 return 0; 3146 } 3147 3148 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3149 { 3150 RAMBlock *block; 3151 uint64_t pages = 0; 3152 3153 /* 3154 * Postcopy is not using xbzrle/compression, so no need for that. 3155 * Also, since source are already halted, we don't need to care 3156 * about dirty page logging as well. 3157 */ 3158 3159 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3160 pages += bitmap_count_one(block->bmap, 3161 block->used_length >> TARGET_PAGE_BITS); 3162 } 3163 3164 /* This may not be aligned with current bitmaps. Recalculate. */ 3165 rs->migration_dirty_pages = pages; 3166 3167 ram_state_reset(rs); 3168 3169 /* Update RAMState cache of output QEMUFile */ 3170 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3171 3172 trace_ram_state_resume_prepare(pages); 3173 } 3174 3175 /* 3176 * This function clears bits of the free pages reported by the caller from the 3177 * migration dirty bitmap. @addr is the host address corresponding to the 3178 * start of the continuous guest free pages, and @len is the total bytes of 3179 * those pages. 3180 */ 3181 void qemu_guest_free_page_hint(void *addr, size_t len) 3182 { 3183 RAMBlock *block; 3184 ram_addr_t offset; 3185 size_t used_len, start, npages; 3186 MigrationState *s = migrate_get_current(); 3187 3188 /* This function is currently expected to be used during live migration */ 3189 if (!migration_is_setup_or_active(s->state)) { 3190 return; 3191 } 3192 3193 for (; len > 0; len -= used_len, addr += used_len) { 3194 block = qemu_ram_block_from_host(addr, false, &offset); 3195 if (unlikely(!block || offset >= block->used_length)) { 3196 /* 3197 * The implementation might not support RAMBlock resize during 3198 * live migration, but it could happen in theory with future 3199 * updates. So we add a check here to capture that case. 
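 * When the check triggers we give up on the whole hint rather than guess at
 * a partially valid range.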
3200 */ 3201 error_report_once("%s unexpected error", __func__); 3202 return; 3203 } 3204 3205 if (len <= block->used_length - offset) { 3206 used_len = len; 3207 } else { 3208 used_len = block->used_length - offset; 3209 } 3210 3211 start = offset >> TARGET_PAGE_BITS; 3212 npages = used_len >> TARGET_PAGE_BITS; 3213 3214 qemu_mutex_lock(&ram_state->bitmap_mutex); 3215 /* 3216 * The skipped free pages are equavalent to be sent from clear_bmap's 3217 * perspective, so clear the bits from the memory region bitmap which 3218 * are initially set. Otherwise those skipped pages will be sent in 3219 * the next round after syncing from the memory region bitmap. 3220 */ 3221 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3222 ram_state->migration_dirty_pages -= 3223 bitmap_count_one_with_offset(block->bmap, start, npages); 3224 bitmap_clear(block->bmap, start, npages); 3225 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3226 } 3227 } 3228 3229 /* 3230 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3231 * long-running RCU critical section. When rcu-reclaims in the code 3232 * start to become numerous it will be necessary to reduce the 3233 * granularity of these critical sections. 3234 */ 3235 3236 /** 3237 * ram_save_setup: Setup RAM for migration 3238 * 3239 * Returns zero to indicate success and negative for error 3240 * 3241 * @f: QEMUFile where to send the data 3242 * @opaque: RAMState pointer 3243 */ 3244 static int ram_save_setup(QEMUFile *f, void *opaque) 3245 { 3246 RAMState **rsp = opaque; 3247 RAMBlock *block; 3248 int ret; 3249 3250 if (compress_threads_save_setup()) { 3251 return -1; 3252 } 3253 3254 /* migration has already setup the bitmap, reuse it. */ 3255 if (!migration_in_colo_state()) { 3256 if (ram_init_all(rsp) != 0) { 3257 compress_threads_save_cleanup(); 3258 return -1; 3259 } 3260 } 3261 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3262 3263 WITH_RCU_READ_LOCK_GUARD() { 3264 qemu_put_be64(f, ram_bytes_total_with_ignored() 3265 | RAM_SAVE_FLAG_MEM_SIZE); 3266 3267 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3268 qemu_put_byte(f, strlen(block->idstr)); 3269 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3270 qemu_put_be64(f, block->used_length); 3271 if (migrate_postcopy_ram() && block->page_size != 3272 qemu_host_page_size) { 3273 qemu_put_be64(f, block->page_size); 3274 } 3275 if (migrate_ignore_shared()) { 3276 qemu_put_be64(f, block->mr->addr); 3277 } 3278 } 3279 } 3280 3281 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3282 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3283 3284 migration_ops = g_malloc0(sizeof(MigrationOps)); 3285 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3286 ret = multifd_send_sync_main(f); 3287 if (ret < 0) { 3288 return ret; 3289 } 3290 3291 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3292 qemu_fflush(f); 3293 3294 return 0; 3295 } 3296 3297 /** 3298 * ram_save_iterate: iterative stage for migration 3299 * 3300 * Returns zero to indicate success and negative for error 3301 * 3302 * @f: QEMUFile where to send the data 3303 * @opaque: RAMState pointer 3304 */ 3305 static int ram_save_iterate(QEMUFile *f, void *opaque) 3306 { 3307 RAMState **temp = opaque; 3308 RAMState *rs = *temp; 3309 int ret = 0; 3310 int i; 3311 int64_t t0; 3312 int done = 0; 3313 3314 if (blk_mig_bulk_active()) { 3315 /* Avoid transferring ram during bulk phase of block migration as 3316 * the bulk phase will usually take a long time and transferring 3317 * ram updates during that time is 
pointless. */ 3318 goto out; 3319 } 3320 3321 /* 3322 * We'll take this lock a little bit long, but it's okay for two reasons. 3323 * Firstly, the only possible other thread to take it is who calls 3324 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3325 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3326 * guarantees that we'll at least released it in a regular basis. 3327 */ 3328 qemu_mutex_lock(&rs->bitmap_mutex); 3329 WITH_RCU_READ_LOCK_GUARD() { 3330 if (ram_list.version != rs->last_version) { 3331 ram_state_reset(rs); 3332 } 3333 3334 /* Read version before ram_list.blocks */ 3335 smp_rmb(); 3336 3337 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3338 3339 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3340 i = 0; 3341 while ((ret = qemu_file_rate_limit(f)) == 0 || 3342 postcopy_has_request(rs)) { 3343 int pages; 3344 3345 if (qemu_file_get_error(f)) { 3346 break; 3347 } 3348 3349 pages = ram_find_and_save_block(rs); 3350 /* no more pages to sent */ 3351 if (pages == 0) { 3352 done = 1; 3353 break; 3354 } 3355 3356 if (pages < 0) { 3357 qemu_file_set_error(f, pages); 3358 break; 3359 } 3360 3361 rs->target_page_count += pages; 3362 3363 /* 3364 * During postcopy, it is necessary to make sure one whole host 3365 * page is sent in one chunk. 3366 */ 3367 if (migrate_postcopy_ram()) { 3368 flush_compressed_data(rs); 3369 } 3370 3371 /* 3372 * we want to check in the 1st loop, just in case it was the 1st 3373 * time and we had to sync the dirty bitmap. 3374 * qemu_clock_get_ns() is a bit expensive, so we only check each 3375 * some iterations 3376 */ 3377 if ((i & 63) == 0) { 3378 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3379 1000000; 3380 if (t1 > MAX_WAIT) { 3381 trace_ram_save_iterate_big_wait(t1, i); 3382 break; 3383 } 3384 } 3385 i++; 3386 } 3387 } 3388 qemu_mutex_unlock(&rs->bitmap_mutex); 3389 3390 /* 3391 * Must occur before EOS (or any QEMUFile operation) 3392 * because of RDMA protocol. 
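 * (The hook behind ram_control_after_iterate() is expected to let RDMA
 * finish any outstanding transfers before the EOS marker is written below.)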
3393 */ 3394 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3395 3396 out: 3397 if (ret >= 0 3398 && migration_is_setup_or_active(migrate_get_current()->state)) { 3399 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3400 if (ret < 0) { 3401 return ret; 3402 } 3403 3404 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3405 qemu_fflush(f); 3406 ram_transferred_add(8); 3407 3408 ret = qemu_file_get_error(f); 3409 } 3410 if (ret < 0) { 3411 return ret; 3412 } 3413 3414 return done; 3415 } 3416 3417 /** 3418 * ram_save_complete: function called to send the remaining amount of ram 3419 * 3420 * Returns zero to indicate success or negative on error 3421 * 3422 * Called with iothread lock 3423 * 3424 * @f: QEMUFile where to send the data 3425 * @opaque: RAMState pointer 3426 */ 3427 static int ram_save_complete(QEMUFile *f, void *opaque) 3428 { 3429 RAMState **temp = opaque; 3430 RAMState *rs = *temp; 3431 int ret = 0; 3432 3433 rs->last_stage = !migration_in_colo_state(); 3434 3435 WITH_RCU_READ_LOCK_GUARD() { 3436 if (!migration_in_postcopy()) { 3437 migration_bitmap_sync_precopy(rs); 3438 } 3439 3440 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3441 3442 /* try transferring iterative blocks of memory */ 3443 3444 /* flush all remaining blocks regardless of rate limiting */ 3445 qemu_mutex_lock(&rs->bitmap_mutex); 3446 while (true) { 3447 int pages; 3448 3449 pages = ram_find_and_save_block(rs); 3450 /* no more blocks to sent */ 3451 if (pages == 0) { 3452 break; 3453 } 3454 if (pages < 0) { 3455 ret = pages; 3456 break; 3457 } 3458 } 3459 qemu_mutex_unlock(&rs->bitmap_mutex); 3460 3461 flush_compressed_data(rs); 3462 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3463 } 3464 3465 if (ret < 0) { 3466 return ret; 3467 } 3468 3469 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3470 if (ret < 0) { 3471 return ret; 3472 } 3473 3474 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3475 qemu_fflush(f); 3476 3477 return 0; 3478 } 3479 3480 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3481 uint64_t *can_postcopy) 3482 { 3483 RAMState **temp = opaque; 3484 RAMState *rs = *temp; 3485 3486 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3487 3488 if (migrate_postcopy_ram()) { 3489 /* We can do postcopy, and all the data is postcopiable */ 3490 *can_postcopy += remaining_size; 3491 } else { 3492 *must_precopy += remaining_size; 3493 } 3494 } 3495 3496 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3497 uint64_t *can_postcopy) 3498 { 3499 MigrationState *s = migrate_get_current(); 3500 RAMState **temp = opaque; 3501 RAMState *rs = *temp; 3502 3503 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3504 3505 if (!migration_in_postcopy() && remaining_size < s->threshold_size) { 3506 qemu_mutex_lock_iothread(); 3507 WITH_RCU_READ_LOCK_GUARD() { 3508 migration_bitmap_sync_precopy(rs); 3509 } 3510 qemu_mutex_unlock_iothread(); 3511 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3512 } 3513 3514 if (migrate_postcopy_ram()) { 3515 /* We can do postcopy, and all the data is postcopiable */ 3516 *can_postcopy += remaining_size; 3517 } else { 3518 *must_precopy += remaining_size; 3519 } 3520 } 3521 3522 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3523 { 3524 unsigned int xh_len; 3525 int xh_flags; 3526 uint8_t *loaded_data; 3527 3528 /* extract RLE header */ 3529 xh_flags = qemu_get_byte(f); 3530 xh_len = qemu_get_be16(f); 3531 3532 if 
(xh_flags != ENCODING_FLAG_XBZRLE) { 3533 error_report("Failed to load XBZRLE page - wrong compression!"); 3534 return -1; 3535 } 3536 3537 if (xh_len > TARGET_PAGE_SIZE) { 3538 error_report("Failed to load XBZRLE page - len overflow!"); 3539 return -1; 3540 } 3541 loaded_data = XBZRLE.decoded_buf; 3542 /* load data and decode */ 3543 /* it can change loaded_data to point to an internal buffer */ 3544 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3545 3546 /* decode RLE */ 3547 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3548 TARGET_PAGE_SIZE) == -1) { 3549 error_report("Failed to load XBZRLE page - decode error!"); 3550 return -1; 3551 } 3552 3553 return 0; 3554 } 3555 3556 /** 3557 * ram_block_from_stream: read a RAMBlock id from the migration stream 3558 * 3559 * Must be called from within a rcu critical section. 3560 * 3561 * Returns a pointer from within the RCU-protected ram_list. 3562 * 3563 * @mis: the migration incoming state pointer 3564 * @f: QEMUFile where to read the data from 3565 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3566 * @channel: the channel we're using 3567 */ 3568 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3569 QEMUFile *f, int flags, 3570 int channel) 3571 { 3572 RAMBlock *block = mis->last_recv_block[channel]; 3573 char id[256]; 3574 uint8_t len; 3575 3576 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3577 if (!block) { 3578 error_report("Ack, bad migration stream!"); 3579 return NULL; 3580 } 3581 return block; 3582 } 3583 3584 len = qemu_get_byte(f); 3585 qemu_get_buffer(f, (uint8_t *)id, len); 3586 id[len] = 0; 3587 3588 block = qemu_ram_block_by_name(id); 3589 if (!block) { 3590 error_report("Can't find block %s", id); 3591 return NULL; 3592 } 3593 3594 if (ramblock_is_ignored(block)) { 3595 error_report("block %s should not be migrated !", id); 3596 return NULL; 3597 } 3598 3599 mis->last_recv_block[channel] = block; 3600 3601 return block; 3602 } 3603 3604 static inline void *host_from_ram_block_offset(RAMBlock *block, 3605 ram_addr_t offset) 3606 { 3607 if (!offset_in_ramblock(block, offset)) { 3608 return NULL; 3609 } 3610 3611 return block->host + offset; 3612 } 3613 3614 static void *host_page_from_ram_block_offset(RAMBlock *block, 3615 ram_addr_t offset) 3616 { 3617 /* Note: Explicitly no check against offset_in_ramblock(). */ 3618 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3619 block->page_size); 3620 } 3621 3622 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3623 ram_addr_t offset) 3624 { 3625 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3626 } 3627 3628 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3629 ram_addr_t offset, bool record_bitmap) 3630 { 3631 if (!offset_in_ramblock(block, offset)) { 3632 return NULL; 3633 } 3634 if (!block->colo_cache) { 3635 error_report("%s: colo_cache is NULL in block :%s", 3636 __func__, block->idstr); 3637 return NULL; 3638 } 3639 3640 /* 3641 * During colo checkpoint, we need bitmap of these migrated pages. 3642 * It help us to decide which pages in ram cache should be flushed 3643 * into VM's RAM later. 
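 * test_and_set_bit() below also makes sure each page is only accounted once
 * in migration_dirty_pages.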
3644 */ 3645 if (record_bitmap && 3646 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3647 ram_state->migration_dirty_pages++; 3648 } 3649 return block->colo_cache + offset; 3650 } 3651 3652 /** 3653 * ram_handle_compressed: handle the zero page case 3654 * 3655 * If a page (or a whole RDMA chunk) has been 3656 * determined to be zero, then zap it. 3657 * 3658 * @host: host address for the zero page 3659 * @ch: what the page is filled from. We only support zero 3660 * @size: size of the zero page 3661 */ 3662 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3663 { 3664 if (ch != 0 || !buffer_is_zero(host, size)) { 3665 memset(host, ch, size); 3666 } 3667 } 3668 3669 /* return the size after decompression, or negative value on error */ 3670 static int 3671 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3672 const uint8_t *source, size_t source_len) 3673 { 3674 int err; 3675 3676 err = inflateReset(stream); 3677 if (err != Z_OK) { 3678 return -1; 3679 } 3680 3681 stream->avail_in = source_len; 3682 stream->next_in = (uint8_t *)source; 3683 stream->avail_out = dest_len; 3684 stream->next_out = dest; 3685 3686 err = inflate(stream, Z_NO_FLUSH); 3687 if (err != Z_STREAM_END) { 3688 return -1; 3689 } 3690 3691 return stream->total_out; 3692 } 3693 3694 static void *do_data_decompress(void *opaque) 3695 { 3696 DecompressParam *param = opaque; 3697 unsigned long pagesize; 3698 uint8_t *des; 3699 int len, ret; 3700 3701 qemu_mutex_lock(&param->mutex); 3702 while (!param->quit) { 3703 if (param->des) { 3704 des = param->des; 3705 len = param->len; 3706 param->des = 0; 3707 qemu_mutex_unlock(&param->mutex); 3708 3709 pagesize = TARGET_PAGE_SIZE; 3710 3711 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3712 param->compbuf, len); 3713 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3714 error_report("decompress data failed"); 3715 qemu_file_set_error(decomp_file, ret); 3716 } 3717 3718 qemu_mutex_lock(&decomp_done_lock); 3719 param->done = true; 3720 qemu_cond_signal(&decomp_done_cond); 3721 qemu_mutex_unlock(&decomp_done_lock); 3722 3723 qemu_mutex_lock(&param->mutex); 3724 } else { 3725 qemu_cond_wait(&param->cond, &param->mutex); 3726 } 3727 } 3728 qemu_mutex_unlock(&param->mutex); 3729 3730 return NULL; 3731 } 3732 3733 static int wait_for_decompress_done(void) 3734 { 3735 int idx, thread_count; 3736 3737 if (!migrate_compress()) { 3738 return 0; 3739 } 3740 3741 thread_count = migrate_decompress_threads(); 3742 qemu_mutex_lock(&decomp_done_lock); 3743 for (idx = 0; idx < thread_count; idx++) { 3744 while (!decomp_param[idx].done) { 3745 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3746 } 3747 } 3748 qemu_mutex_unlock(&decomp_done_lock); 3749 return qemu_file_get_error(decomp_file); 3750 } 3751 3752 static void compress_threads_load_cleanup(void) 3753 { 3754 int i, thread_count; 3755 3756 if (!migrate_compress()) { 3757 return; 3758 } 3759 thread_count = migrate_decompress_threads(); 3760 for (i = 0; i < thread_count; i++) { 3761 /* 3762 * we use it as an indicator which shows if the thread is 3763 * properly init'd or not 3764 */ 3765 if (!decomp_param[i].compbuf) { 3766 break; 3767 } 3768 3769 qemu_mutex_lock(&decomp_param[i].mutex); 3770 decomp_param[i].quit = true; 3771 qemu_cond_signal(&decomp_param[i].cond); 3772 qemu_mutex_unlock(&decomp_param[i].mutex); 3773 } 3774 for (i = 0; i < thread_count; i++) { 3775 if (!decomp_param[i].compbuf) { 3776 break; 3777 } 3778 3779 qemu_thread_join(decompress_threads + i); 3780
qemu_mutex_destroy(&decomp_param[i].mutex); 3781 qemu_cond_destroy(&decomp_param[i].cond); 3782 inflateEnd(&decomp_param[i].stream); 3783 g_free(decomp_param[i].compbuf); 3784 decomp_param[i].compbuf = NULL; 3785 } 3786 g_free(decompress_threads); 3787 g_free(decomp_param); 3788 decompress_threads = NULL; 3789 decomp_param = NULL; 3790 decomp_file = NULL; 3791 } 3792 3793 static int compress_threads_load_setup(QEMUFile *f) 3794 { 3795 int i, thread_count; 3796 3797 if (!migrate_compress()) { 3798 return 0; 3799 } 3800 3801 thread_count = migrate_decompress_threads(); 3802 decompress_threads = g_new0(QemuThread, thread_count); 3803 decomp_param = g_new0(DecompressParam, thread_count); 3804 qemu_mutex_init(&decomp_done_lock); 3805 qemu_cond_init(&decomp_done_cond); 3806 decomp_file = f; 3807 for (i = 0; i < thread_count; i++) { 3808 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3809 goto exit; 3810 } 3811 3812 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3813 qemu_mutex_init(&decomp_param[i].mutex); 3814 qemu_cond_init(&decomp_param[i].cond); 3815 decomp_param[i].done = true; 3816 decomp_param[i].quit = false; 3817 qemu_thread_create(decompress_threads + i, "decompress", 3818 do_data_decompress, decomp_param + i, 3819 QEMU_THREAD_JOINABLE); 3820 } 3821 return 0; 3822 exit: 3823 compress_threads_load_cleanup(); 3824 return -1; 3825 } 3826 3827 static void decompress_data_with_multi_threads(QEMUFile *f, 3828 void *host, int len) 3829 { 3830 int idx, thread_count; 3831 3832 thread_count = migrate_decompress_threads(); 3833 QEMU_LOCK_GUARD(&decomp_done_lock); 3834 while (true) { 3835 for (idx = 0; idx < thread_count; idx++) { 3836 if (decomp_param[idx].done) { 3837 decomp_param[idx].done = false; 3838 qemu_mutex_lock(&decomp_param[idx].mutex); 3839 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3840 decomp_param[idx].des = host; 3841 decomp_param[idx].len = len; 3842 qemu_cond_signal(&decomp_param[idx].cond); 3843 qemu_mutex_unlock(&decomp_param[idx].mutex); 3844 break; 3845 } 3846 } 3847 if (idx < thread_count) { 3848 break; 3849 } else { 3850 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3851 } 3852 } 3853 } 3854 3855 static void colo_init_ram_state(void) 3856 { 3857 ram_state_init(&ram_state); 3858 } 3859 3860 /* 3861 * colo cache: this is for secondary VM, we cache the whole 3862 * memory of the secondary VM, it is need to hold the global lock 3863 * to call this helper. 3864 */ 3865 int colo_init_ram_cache(void) 3866 { 3867 RAMBlock *block; 3868 3869 WITH_RCU_READ_LOCK_GUARD() { 3870 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3871 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3872 NULL, false, false); 3873 if (!block->colo_cache) { 3874 error_report("%s: Can't alloc memory for COLO cache of block %s," 3875 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3876 block->used_length); 3877 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3878 if (block->colo_cache) { 3879 qemu_anon_ram_free(block->colo_cache, block->used_length); 3880 block->colo_cache = NULL; 3881 } 3882 } 3883 return -errno; 3884 } 3885 if (!machine_dump_guest_core(current_machine)) { 3886 qemu_madvise(block->colo_cache, block->used_length, 3887 QEMU_MADV_DONTDUMP); 3888 } 3889 } 3890 } 3891 3892 /* 3893 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3894 * with to decide which page in cache should be flushed into SVM's RAM. Here 3895 * we use the same name 'ram_bitmap' as for migration. 
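 * The bitmap starts out all clear here; colo_cache_from_block_offset() sets
 * bits as dirty pages arrive from the PVM.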
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock must be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
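 *
 * Pages are received into a per-channel temporary buffer and only placed
 * into guest memory once every target page of the current host page has
 * arrived, so that host pages (possibly hugetlbfs-backed) are populated
 * atomically.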
 *
 * @f: QEMUFile where to receive the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If there is a QEMUFile error, we should stop here; "addr" may
         * then be invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages,
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page.
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip setting page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
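             * In that case the host page stays all_zero and will be placed
             * with postcopy_place_page_zero() below, so nothing needs to be
             * copied into the temporary buffer.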
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
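 * The dirty bitmap is synchronized first; then each run of dirty pages is
 * copied from block->colo_cache into block->host and the corresponding
 * dirty bits are cleared.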
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_compress()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into the COLO stage, we should not load pages
             * into the SVM's memory directly; we put them into colo_cache
             * first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory during the COLO
             * preparation stage, while the VM had to be stopped, which is a
             * time-consuming process.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled; this affects the
             * speed of the migration, but it clearly reduces the downtime
             * of backing up all of the SVM's memory in the COLO preparation
             * stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
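        /*
         * RAM_SAVE_FLAG_EOS terminates this iteration of the stream; it is
         * also used as a synchronization point for the multifd receive
         * threads.
         */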
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
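 *
 * The stream read below contains, per ramblock: a be64 bitmap size, the
 * bitmap itself in little-endian order (padded up to a multiple of 8
 * bytes), and a be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).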
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this
     * is the last one to sync, we need to notify the main send thread.
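     * The notification is a post on s->rp_state.rp_sem, which
     * ram_dirty_bitmap_sync_all() waits on once per ramblock.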
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
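
/*
 * For reference, a minimal sketch (not the actual save-side implementation)
 * of the per-page framing that ram_load_precopy() and ram_load_postcopy()
 * parse above: the page offset within its RAMBlock is OR'ed with the
 * RAM_SAVE_FLAG_* bits into a single be64 word, optionally followed by the
 * block name (only when RAM_SAVE_FLAG_CONTINUE is not set), and then the
 * payload.  "same_block_as_last_page" is a hypothetical stand-in for the
 * CONTINUE-flag bookkeeping done by the real save path:
 *
 *     qemu_put_be64(f, (offset & TARGET_PAGE_MASK) | RAM_SAVE_FLAG_PAGE);
 *     if (!same_block_as_last_page) {
 *         qemu_put_byte(f, strlen(block->idstr));
 *         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
 *     }
 *     qemu_put_buffer(f, page, TARGET_PAGE_SIZE);
 *
 * Each iteration of the stream is terminated by a bare RAM_SAVE_FLAG_EOS
 * word.
 */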