/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it worked
 * for pages that were filled with the same char.  We switched it to only
 * search for the zero value, and renamed it to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

XBZRLECacheStats xbzrle_counters;

/* struct containing the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock.
*/ 96 PageCache *cache; 97 QemuMutex lock; 98 /* it will store a page full of zeros */ 99 uint8_t *zero_target_page; 100 /* buffer used for XBZRLE decoding */ 101 uint8_t *decoded_buf; 102 } XBZRLE; 103 104 static void XBZRLE_cache_lock(void) 105 { 106 if (migrate_use_xbzrle()) { 107 qemu_mutex_lock(&XBZRLE.lock); 108 } 109 } 110 111 static void XBZRLE_cache_unlock(void) 112 { 113 if (migrate_use_xbzrle()) { 114 qemu_mutex_unlock(&XBZRLE.lock); 115 } 116 } 117 118 /** 119 * xbzrle_cache_resize: resize the xbzrle cache 120 * 121 * This function is called from migrate_params_apply in main 122 * thread, possibly while a migration is in progress. A running 123 * migration may be using the cache and might finish during this call, 124 * hence changes to the cache are protected by XBZRLE.lock(). 125 * 126 * Returns 0 for success or -1 for error 127 * 128 * @new_size: new cache size 129 * @errp: set *errp if the check failed, with reason 130 */ 131 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 132 { 133 PageCache *new_cache; 134 int64_t ret = 0; 135 136 /* Check for truncation */ 137 if (new_size != (size_t)new_size) { 138 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 139 "exceeding address space"); 140 return -1; 141 } 142 143 if (new_size == migrate_xbzrle_cache_size()) { 144 /* nothing to do */ 145 return 0; 146 } 147 148 XBZRLE_cache_lock(); 149 150 if (XBZRLE.cache != NULL) { 151 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 152 if (!new_cache) { 153 ret = -1; 154 goto out; 155 } 156 157 cache_fini(XBZRLE.cache); 158 XBZRLE.cache = new_cache; 159 } 160 out: 161 XBZRLE_cache_unlock(); 162 return ret; 163 } 164 165 bool ramblock_is_ignored(RAMBlock *block) 166 { 167 return !qemu_ram_is_migratable(block) || 168 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 169 } 170 171 #undef RAMBLOCK_FOREACH 172 173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 174 { 175 RAMBlock *block; 176 int ret = 0; 177 178 RCU_READ_LOCK_GUARD(); 179 180 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 181 ret = func(block, opaque); 182 if (ret) { 183 break; 184 } 185 } 186 return ret; 187 } 188 189 static void ramblock_recv_map_init(void) 190 { 191 RAMBlock *rb; 192 193 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 194 assert(!rb->receivedmap); 195 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 196 } 197 } 198 199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 200 { 201 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 202 rb->receivedmap); 203 } 204 205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 206 { 207 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 208 } 209 210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 211 { 212 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 213 } 214 215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 216 size_t nr) 217 { 218 bitmap_set_atomic(rb->receivedmap, 219 ramblock_recv_bitmap_offset(host_addr, rb), 220 nr); 221 } 222 223 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 224 225 /* 226 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 227 * 228 * Returns >0 if success with sent bytes, or <0 if error. 
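 *
 * (Illustrative example, not upstream documentation: for a hypothetical
 * 1 GiB block with 4 KiB target pages, nbits = 262144, so
 * size = DIV_ROUND_UP(nbits, 8) = 32768 bytes, which is already a multiple
 * of 8; the function then returns 32768 + 8 to account for the leading
 * 8-byte length field.  The trailing RAMBLOCK_RECV_BITMAP_ENDING marker is
 * written to the stream but not counted in the return value.)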
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

typedef struct {
    /*
     * Cached ramblock/offset values if preempted.  They're only meaningful if
     * preempted==true below.
     */
    RAMBlock *ram_block;
    unsigned long ram_page;
    /*
     * Whether a postcopy preemption just happened.  Will be reset after
     * precopy recovered to background migration.
309 */ 310 bool preempted; 311 } PostcopyPreemptState; 312 313 /* State of RAM for migration */ 314 struct RAMState { 315 /* QEMUFile used for this migration */ 316 QEMUFile *f; 317 /* UFFD file descriptor, used in 'write-tracking' migration */ 318 int uffdio_fd; 319 /* Last block that we have visited searching for dirty pages */ 320 RAMBlock *last_seen_block; 321 /* Last block from where we have sent data */ 322 RAMBlock *last_sent_block; 323 /* Last dirty target page we have sent */ 324 ram_addr_t last_page; 325 /* last ram version we have seen */ 326 uint32_t last_version; 327 /* How many times we have dirty too many pages */ 328 int dirty_rate_high_cnt; 329 /* these variables are used for bitmap sync */ 330 /* last time we did a full bitmap_sync */ 331 int64_t time_last_bitmap_sync; 332 /* bytes transferred at start_time */ 333 uint64_t bytes_xfer_prev; 334 /* number of dirty pages since start_time */ 335 uint64_t num_dirty_pages_period; 336 /* xbzrle misses since the beginning of the period */ 337 uint64_t xbzrle_cache_miss_prev; 338 /* Amount of xbzrle pages since the beginning of the period */ 339 uint64_t xbzrle_pages_prev; 340 /* Amount of xbzrle encoded bytes since the beginning of the period */ 341 uint64_t xbzrle_bytes_prev; 342 /* Start using XBZRLE (e.g., after the first round). */ 343 bool xbzrle_enabled; 344 /* Are we on the last stage of migration */ 345 bool last_stage; 346 /* compression statistics since the beginning of the period */ 347 /* amount of count that no free thread to compress data */ 348 uint64_t compress_thread_busy_prev; 349 /* amount bytes after compression */ 350 uint64_t compressed_size_prev; 351 /* amount of compressed pages */ 352 uint64_t compress_pages_prev; 353 354 /* total handled target pages at the beginning of period */ 355 uint64_t target_page_count_prev; 356 /* total handled target pages since start */ 357 uint64_t target_page_count; 358 /* number of dirty bits in the bitmap */ 359 uint64_t migration_dirty_pages; 360 /* Protects modification of the bitmap and migration dirty pages */ 361 QemuMutex bitmap_mutex; 362 /* The RAMBlock used in the last src_page_requests */ 363 RAMBlock *last_req_rb; 364 /* Queue of outstanding page requests from the destination */ 365 QemuMutex src_page_req_mutex; 366 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 367 368 /* Postcopy preemption informations */ 369 PostcopyPreemptState postcopy_preempt_state; 370 /* 371 * Current channel we're using on src VM. Only valid if postcopy-preempt 372 * is enabled. 373 */ 374 unsigned int postcopy_channel; 375 }; 376 typedef struct RAMState RAMState; 377 378 static RAMState *ram_state; 379 380 static NotifierWithReturnList precopy_notifier_list; 381 382 static void postcopy_preempt_reset(RAMState *rs) 383 { 384 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState)); 385 } 386 387 /* Whether postcopy has queued requests? 
*/ 388 static bool postcopy_has_request(RAMState *rs) 389 { 390 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 391 } 392 393 void precopy_infrastructure_init(void) 394 { 395 notifier_with_return_list_init(&precopy_notifier_list); 396 } 397 398 void precopy_add_notifier(NotifierWithReturn *n) 399 { 400 notifier_with_return_list_add(&precopy_notifier_list, n); 401 } 402 403 void precopy_remove_notifier(NotifierWithReturn *n) 404 { 405 notifier_with_return_remove(n); 406 } 407 408 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 409 { 410 PrecopyNotifyData pnd; 411 pnd.reason = reason; 412 pnd.errp = errp; 413 414 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 415 } 416 417 uint64_t ram_bytes_remaining(void) 418 { 419 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 420 0; 421 } 422 423 MigrationStats ram_counters; 424 425 static void ram_transferred_add(uint64_t bytes) 426 { 427 if (runstate_is_running()) { 428 ram_counters.precopy_bytes += bytes; 429 } else if (migration_in_postcopy()) { 430 ram_counters.postcopy_bytes += bytes; 431 } else { 432 ram_counters.downtime_bytes += bytes; 433 } 434 ram_counters.transferred += bytes; 435 } 436 437 void dirty_sync_missed_zero_copy(void) 438 { 439 ram_counters.dirty_sync_missed_zero_copy++; 440 } 441 442 /* used by the search for pages to send */ 443 struct PageSearchStatus { 444 /* Current block being searched */ 445 RAMBlock *block; 446 /* Current page to search from */ 447 unsigned long page; 448 /* Set once we wrap around */ 449 bool complete_round; 450 /* 451 * [POSTCOPY-ONLY] Whether current page is explicitly requested by 452 * postcopy. When set, the request is "urgent" because the dest QEMU 453 * threads are waiting for us. 454 */ 455 bool postcopy_requested; 456 /* 457 * [POSTCOPY-ONLY] The target channel to use to send current page. 458 * 459 * Note: This may _not_ match with the value in postcopy_requested 460 * above. Let's imagine the case where the postcopy request is exactly 461 * the page that we're sending in progress during precopy. In this case 462 * we'll have postcopy_requested set to true but the target channel 463 * will be the precopy channel (so that we don't split brain on that 464 * specific page since the precopy channel already contains partial of 465 * that page data). 466 * 467 * Besides that specific use case, postcopy_target_channel should 468 * always be equal to postcopy_requested, because by default we send 469 * postcopy pages via postcopy preempt channel. 470 */ 471 bool postcopy_target_channel; 472 }; 473 typedef struct PageSearchStatus PageSearchStatus; 474 475 CompressionStats compression_counters; 476 477 struct CompressParam { 478 bool done; 479 bool quit; 480 bool zero_page; 481 QEMUFile *file; 482 QemuMutex mutex; 483 QemuCond cond; 484 RAMBlock *block; 485 ram_addr_t offset; 486 487 /* internally used fields */ 488 z_stream stream; 489 uint8_t *originbuf; 490 }; 491 typedef struct CompressParam CompressParam; 492 493 struct DecompressParam { 494 bool done; 495 bool quit; 496 QemuMutex mutex; 497 QemuCond cond; 498 void *des; 499 uint8_t *compbuf; 500 int len; 501 z_stream stream; 502 }; 503 typedef struct DecompressParam DecompressParam; 504 505 static CompressParam *comp_param; 506 static QemuThread *compress_threads; 507 /* comp_done_cond is used to wake up the migration thread when 508 * one of the compression threads has finished the compression. 509 * comp_done_lock is used to co-work with comp_done_cond. 
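 *
 * (Illustrative sketch of the consumer side, as used by
 * flush_compressed_data() further down, assuming the usual
 * QemuMutex/QemuCond pairing:
 *
 *     qemu_mutex_lock(&comp_done_lock);
 *     while (!comp_param[idx].done) {
 *         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 *     }
 *     qemu_mutex_unlock(&comp_done_lock);
 * )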
510 */ 511 static QemuMutex comp_done_lock; 512 static QemuCond comp_done_cond; 513 514 static QEMUFile *decomp_file; 515 static DecompressParam *decomp_param; 516 static QemuThread *decompress_threads; 517 static QemuMutex decomp_done_lock; 518 static QemuCond decomp_done_cond; 519 520 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 521 ram_addr_t offset, uint8_t *source_buf); 522 523 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, 524 bool postcopy_requested); 525 526 static void *do_data_compress(void *opaque) 527 { 528 CompressParam *param = opaque; 529 RAMBlock *block; 530 ram_addr_t offset; 531 bool zero_page; 532 533 qemu_mutex_lock(¶m->mutex); 534 while (!param->quit) { 535 if (param->block) { 536 block = param->block; 537 offset = param->offset; 538 param->block = NULL; 539 qemu_mutex_unlock(¶m->mutex); 540 541 zero_page = do_compress_ram_page(param->file, ¶m->stream, 542 block, offset, param->originbuf); 543 544 qemu_mutex_lock(&comp_done_lock); 545 param->done = true; 546 param->zero_page = zero_page; 547 qemu_cond_signal(&comp_done_cond); 548 qemu_mutex_unlock(&comp_done_lock); 549 550 qemu_mutex_lock(¶m->mutex); 551 } else { 552 qemu_cond_wait(¶m->cond, ¶m->mutex); 553 } 554 } 555 qemu_mutex_unlock(¶m->mutex); 556 557 return NULL; 558 } 559 560 static void compress_threads_save_cleanup(void) 561 { 562 int i, thread_count; 563 564 if (!migrate_use_compression() || !comp_param) { 565 return; 566 } 567 568 thread_count = migrate_compress_threads(); 569 for (i = 0; i < thread_count; i++) { 570 /* 571 * we use it as a indicator which shows if the thread is 572 * properly init'd or not 573 */ 574 if (!comp_param[i].file) { 575 break; 576 } 577 578 qemu_mutex_lock(&comp_param[i].mutex); 579 comp_param[i].quit = true; 580 qemu_cond_signal(&comp_param[i].cond); 581 qemu_mutex_unlock(&comp_param[i].mutex); 582 583 qemu_thread_join(compress_threads + i); 584 qemu_mutex_destroy(&comp_param[i].mutex); 585 qemu_cond_destroy(&comp_param[i].cond); 586 deflateEnd(&comp_param[i].stream); 587 g_free(comp_param[i].originbuf); 588 qemu_fclose(comp_param[i].file); 589 comp_param[i].file = NULL; 590 } 591 qemu_mutex_destroy(&comp_done_lock); 592 qemu_cond_destroy(&comp_done_cond); 593 g_free(compress_threads); 594 g_free(comp_param); 595 compress_threads = NULL; 596 comp_param = NULL; 597 } 598 599 static int compress_threads_save_setup(void) 600 { 601 int i, thread_count; 602 603 if (!migrate_use_compression()) { 604 return 0; 605 } 606 thread_count = migrate_compress_threads(); 607 compress_threads = g_new0(QemuThread, thread_count); 608 comp_param = g_new0(CompressParam, thread_count); 609 qemu_cond_init(&comp_done_cond); 610 qemu_mutex_init(&comp_done_lock); 611 for (i = 0; i < thread_count; i++) { 612 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 613 if (!comp_param[i].originbuf) { 614 goto exit; 615 } 616 617 if (deflateInit(&comp_param[i].stream, 618 migrate_compress_level()) != Z_OK) { 619 g_free(comp_param[i].originbuf); 620 goto exit; 621 } 622 623 /* comp_param[i].file is just used as a dummy buffer to save data, 624 * set its ops to empty. 
625 */ 626 comp_param[i].file = qemu_file_new_output( 627 QIO_CHANNEL(qio_channel_null_new())); 628 comp_param[i].done = true; 629 comp_param[i].quit = false; 630 qemu_mutex_init(&comp_param[i].mutex); 631 qemu_cond_init(&comp_param[i].cond); 632 qemu_thread_create(compress_threads + i, "compress", 633 do_data_compress, comp_param + i, 634 QEMU_THREAD_JOINABLE); 635 } 636 return 0; 637 638 exit: 639 compress_threads_save_cleanup(); 640 return -1; 641 } 642 643 /** 644 * save_page_header: write page header to wire 645 * 646 * If this is the 1st block, it also writes the block identification 647 * 648 * Returns the number of bytes written 649 * 650 * @f: QEMUFile where to send the data 651 * @block: block that contains the page we want to send 652 * @offset: offset inside the block for the page 653 * in the lower bits, it contains flags 654 */ 655 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 656 ram_addr_t offset) 657 { 658 size_t size, len; 659 660 if (block == rs->last_sent_block) { 661 offset |= RAM_SAVE_FLAG_CONTINUE; 662 } 663 qemu_put_be64(f, offset); 664 size = 8; 665 666 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 667 len = strlen(block->idstr); 668 qemu_put_byte(f, len); 669 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 670 size += 1 + len; 671 rs->last_sent_block = block; 672 } 673 return size; 674 } 675 676 /** 677 * mig_throttle_guest_down: throttle down the guest 678 * 679 * Reduce amount of guest cpu execution to hopefully slow down memory 680 * writes. If guest dirty memory rate is reduced below the rate at 681 * which we can transfer pages to the destination then we should be 682 * able to complete migration. Some workloads dirty memory way too 683 * fast and will not effectively converge, even with auto-converge. 684 */ 685 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 686 uint64_t bytes_dirty_threshold) 687 { 688 MigrationState *s = migrate_get_current(); 689 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 690 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 691 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 692 int pct_max = s->parameters.max_cpu_throttle; 693 694 uint64_t throttle_now = cpu_throttle_get_percentage(); 695 uint64_t cpu_now, cpu_ideal, throttle_inc; 696 697 /* We have not started throttling yet. Let's start it. */ 698 if (!cpu_throttle_active()) { 699 cpu_throttle_set(pct_initial); 700 } else { 701 /* Throttling already on, just increase the rate */ 702 if (!pct_tailslow) { 703 throttle_inc = pct_increment; 704 } else { 705 /* Compute the ideal CPU percentage used by Guest, which may 706 * make the dirty rate match the dirty rate threshold. */ 707 cpu_now = 100 - throttle_now; 708 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 709 bytes_dirty_period); 710 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 711 } 712 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 713 } 714 } 715 716 void mig_throttle_counter_reset(void) 717 { 718 RAMState *rs = ram_state; 719 720 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 721 rs->num_dirty_pages_period = 0; 722 rs->bytes_xfer_prev = ram_counters.transferred; 723 } 724 725 /** 726 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 727 * 728 * @rs: current RAM state 729 * @current_addr: address for the zero page 730 * 731 * Update the xbzrle cache to reflect a page that's been sent as all 0. 
732 * The important thing is that a stale (not-yet-0'd) page be replaced 733 * by the new data. 734 * As a bonus, if the page wasn't in the cache it gets added so that 735 * when a small write is made into the 0'd page it gets XBZRLE sent. 736 */ 737 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 738 { 739 if (!rs->xbzrle_enabled) { 740 return; 741 } 742 743 /* We don't care if this fails to allocate a new cache page 744 * as long as it updated an old one */ 745 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 746 ram_counters.dirty_sync_count); 747 } 748 749 #define ENCODING_FLAG_XBZRLE 0x1 750 751 /** 752 * save_xbzrle_page: compress and send current page 753 * 754 * Returns: 1 means that we wrote the page 755 * 0 means that page is identical to the one already sent 756 * -1 means that xbzrle would be longer than normal 757 * 758 * @rs: current RAM state 759 * @current_data: pointer to the address of the page contents 760 * @current_addr: addr of the page 761 * @block: block that contains the page we want to send 762 * @offset: offset inside the block for the page 763 */ 764 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 765 ram_addr_t current_addr, RAMBlock *block, 766 ram_addr_t offset) 767 { 768 int encoded_len = 0, bytes_xbzrle; 769 uint8_t *prev_cached_page; 770 771 if (!cache_is_cached(XBZRLE.cache, current_addr, 772 ram_counters.dirty_sync_count)) { 773 xbzrle_counters.cache_miss++; 774 if (!rs->last_stage) { 775 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 776 ram_counters.dirty_sync_count) == -1) { 777 return -1; 778 } else { 779 /* update *current_data when the page has been 780 inserted into cache */ 781 *current_data = get_cached_data(XBZRLE.cache, current_addr); 782 } 783 } 784 return -1; 785 } 786 787 /* 788 * Reaching here means the page has hit the xbzrle cache, no matter what 789 * encoding result it is (normal encoding, overflow or skipping the page), 790 * count the page as encoded. This is used to calculate the encoding rate. 791 * 792 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 793 * 2nd page turns out to be skipped (i.e. no new bytes written to the 794 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 795 * skipped page included. In this way, the encoding rate can tell if the 796 * guest page is good for xbzrle encoding. 797 */ 798 xbzrle_counters.pages++; 799 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 800 801 /* save current buffer into memory */ 802 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 803 804 /* XBZRLE encoding (if there is no overflow) */ 805 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 806 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 807 TARGET_PAGE_SIZE); 808 809 /* 810 * Update the cache contents, so that it corresponds to the data 811 * sent, in all cases except where we skip the page. 812 */ 813 if (!rs->last_stage && encoded_len != 0) { 814 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 815 /* 816 * In the case where we couldn't compress, ensure that the caller 817 * sends the data from the cache, since the guest might have 818 * changed the RAM since we copied it. 
819 */ 820 *current_data = prev_cached_page; 821 } 822 823 if (encoded_len == 0) { 824 trace_save_xbzrle_page_skipping(); 825 return 0; 826 } else if (encoded_len == -1) { 827 trace_save_xbzrle_page_overflow(); 828 xbzrle_counters.overflow++; 829 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 830 return -1; 831 } 832 833 /* Send XBZRLE based compressed page */ 834 bytes_xbzrle = save_page_header(rs, rs->f, block, 835 offset | RAM_SAVE_FLAG_XBZRLE); 836 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 837 qemu_put_be16(rs->f, encoded_len); 838 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 839 bytes_xbzrle += encoded_len + 1 + 2; 840 /* 841 * Like compressed_size (please see update_compress_thread_counts), 842 * the xbzrle encoded bytes don't count the 8 byte header with 843 * RAM_SAVE_FLAG_CONTINUE. 844 */ 845 xbzrle_counters.bytes += bytes_xbzrle - 8; 846 ram_transferred_add(bytes_xbzrle); 847 848 return 1; 849 } 850 851 /** 852 * migration_bitmap_find_dirty: find the next dirty page from start 853 * 854 * Returns the page offset within memory region of the start of a dirty page 855 * 856 * @rs: current RAM state 857 * @rb: RAMBlock where to search for dirty pages 858 * @start: page where we start the search 859 */ 860 static inline 861 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 862 unsigned long start) 863 { 864 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 865 unsigned long *bitmap = rb->bmap; 866 867 if (ramblock_is_ignored(rb)) { 868 return size; 869 } 870 871 return find_next_bit(bitmap, size, start); 872 } 873 874 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 875 unsigned long page) 876 { 877 uint8_t shift; 878 hwaddr size, start; 879 880 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 881 return; 882 } 883 884 shift = rb->clear_bmap_shift; 885 /* 886 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 887 * can make things easier sometimes since then start address 888 * of the small chunk will always be 64 pages aligned so the 889 * bitmap will always be aligned to unsigned long. We should 890 * even be able to remove this restriction but I'm simply 891 * keeping it. 892 */ 893 assert(shift >= 6); 894 895 size = 1ULL << (TARGET_PAGE_BITS + shift); 896 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 897 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 898 memory_region_clear_dirty_bitmap(rb->mr, start, size); 899 } 900 901 static void 902 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 903 unsigned long start, 904 unsigned long npages) 905 { 906 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 907 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 908 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 909 910 /* 911 * Clear pages from start to start + npages - 1, so the end boundary is 912 * exclusive. 
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
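 *
 * (Illustrative example, assuming a RamDiscardManager is attached: if it
 * reports a 2 MiB range at offset 0 as discarded and the target page size
 * is 4 KiB, dirty_bitmap_clear_section() above is replayed for that range,
 * clears bits 0..511 of rb->bmap, and adds however many of those bits were
 * actually set to the count returned here.)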
1012 */ 1013 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 1014 { 1015 uint64_t cleared_bits = 0; 1016 1017 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 1018 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1019 MemoryRegionSection section = { 1020 .mr = rb->mr, 1021 .offset_within_region = 0, 1022 .size = int128_make64(qemu_ram_get_used_length(rb)), 1023 }; 1024 1025 ram_discard_manager_replay_discarded(rdm, §ion, 1026 dirty_bitmap_clear_section, 1027 &cleared_bits); 1028 } 1029 return cleared_bits; 1030 } 1031 1032 /* 1033 * Check if a host-page aligned page falls into a discarded range as managed by 1034 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 1035 * 1036 * Note: The result is only stable while migrating (precopy/postcopy). 1037 */ 1038 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 1039 { 1040 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1041 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1042 MemoryRegionSection section = { 1043 .mr = rb->mr, 1044 .offset_within_region = start, 1045 .size = int128_make64(qemu_ram_pagesize(rb)), 1046 }; 1047 1048 return !ram_discard_manager_is_populated(rdm, §ion); 1049 } 1050 return false; 1051 } 1052 1053 /* Called with RCU critical section */ 1054 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 1055 { 1056 uint64_t new_dirty_pages = 1057 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 1058 1059 rs->migration_dirty_pages += new_dirty_pages; 1060 rs->num_dirty_pages_period += new_dirty_pages; 1061 } 1062 1063 /** 1064 * ram_pagesize_summary: calculate all the pagesizes of a VM 1065 * 1066 * Returns a summary bitmap of the page sizes of all RAMBlocks 1067 * 1068 * For VMs with just normal pages this is equivalent to the host page 1069 * size. If it's got some huge pages then it's the OR of all the 1070 * different page sizes. 
1071 */ 1072 uint64_t ram_pagesize_summary(void) 1073 { 1074 RAMBlock *block; 1075 uint64_t summary = 0; 1076 1077 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1078 summary |= block->page_size; 1079 } 1080 1081 return summary; 1082 } 1083 1084 uint64_t ram_get_total_transferred_pages(void) 1085 { 1086 return ram_counters.normal + ram_counters.duplicate + 1087 compression_counters.pages + xbzrle_counters.pages; 1088 } 1089 1090 static void migration_update_rates(RAMState *rs, int64_t end_time) 1091 { 1092 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1093 double compressed_size; 1094 1095 /* calculate period counters */ 1096 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1097 / (end_time - rs->time_last_bitmap_sync); 1098 1099 if (!page_count) { 1100 return; 1101 } 1102 1103 if (migrate_use_xbzrle()) { 1104 double encoded_size, unencoded_size; 1105 1106 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1107 rs->xbzrle_cache_miss_prev) / page_count; 1108 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1109 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1110 TARGET_PAGE_SIZE; 1111 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1112 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1113 xbzrle_counters.encoding_rate = 0; 1114 } else { 1115 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1116 } 1117 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1118 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1119 } 1120 1121 if (migrate_use_compression()) { 1122 compression_counters.busy_rate = (double)(compression_counters.busy - 1123 rs->compress_thread_busy_prev) / page_count; 1124 rs->compress_thread_busy_prev = compression_counters.busy; 1125 1126 compressed_size = compression_counters.compressed_size - 1127 rs->compressed_size_prev; 1128 if (compressed_size) { 1129 double uncompressed_size = (compression_counters.pages - 1130 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1131 1132 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1133 compression_counters.compression_rate = 1134 uncompressed_size / compressed_size; 1135 1136 rs->compress_pages_prev = compression_counters.pages; 1137 rs->compressed_size_prev = compression_counters.compressed_size; 1138 } 1139 } 1140 } 1141 1142 static void migration_trigger_throttle(RAMState *rs) 1143 { 1144 MigrationState *s = migrate_get_current(); 1145 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1146 1147 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 1148 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1149 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1150 1151 /* During block migration the auto-converge logic incorrectly detects 1152 * that ram migration makes no progress. Avoid this by disabling the 1153 * throttling logic during the bulk phase of block migration. */ 1154 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1155 /* The following detection logic can be refined later. For now: 1156 Check to see if the ratio between dirtied bytes and the approx. 1157 amount of bytes that just got transferred since the last time 1158 we were in this routine reaches the threshold. If that happens 1159 twice, start or increase throttling. 
*/ 1160 1161 if ((bytes_dirty_period > bytes_dirty_threshold) && 1162 (++rs->dirty_rate_high_cnt >= 2)) { 1163 trace_migration_throttle(); 1164 rs->dirty_rate_high_cnt = 0; 1165 mig_throttle_guest_down(bytes_dirty_period, 1166 bytes_dirty_threshold); 1167 } 1168 } 1169 } 1170 1171 static void migration_bitmap_sync(RAMState *rs) 1172 { 1173 RAMBlock *block; 1174 int64_t end_time; 1175 1176 ram_counters.dirty_sync_count++; 1177 1178 if (!rs->time_last_bitmap_sync) { 1179 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1180 } 1181 1182 trace_migration_bitmap_sync_start(); 1183 memory_global_dirty_log_sync(); 1184 1185 qemu_mutex_lock(&rs->bitmap_mutex); 1186 WITH_RCU_READ_LOCK_GUARD() { 1187 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1188 ramblock_sync_dirty_bitmap(rs, block); 1189 } 1190 ram_counters.remaining = ram_bytes_remaining(); 1191 } 1192 qemu_mutex_unlock(&rs->bitmap_mutex); 1193 1194 memory_global_after_dirty_log_sync(); 1195 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1196 1197 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1198 1199 /* more than 1 second = 1000 millisecons */ 1200 if (end_time > rs->time_last_bitmap_sync + 1000) { 1201 migration_trigger_throttle(rs); 1202 1203 migration_update_rates(rs, end_time); 1204 1205 rs->target_page_count_prev = rs->target_page_count; 1206 1207 /* reset period counters */ 1208 rs->time_last_bitmap_sync = end_time; 1209 rs->num_dirty_pages_period = 0; 1210 rs->bytes_xfer_prev = ram_counters.transferred; 1211 } 1212 if (migrate_use_events()) { 1213 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1214 } 1215 } 1216 1217 static void migration_bitmap_sync_precopy(RAMState *rs) 1218 { 1219 Error *local_err = NULL; 1220 1221 /* 1222 * The current notifier usage is just an optimization to migration, so we 1223 * don't stop the normal migration process in the error case. 1224 */ 1225 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1226 error_report_err(local_err); 1227 local_err = NULL; 1228 } 1229 1230 migration_bitmap_sync(rs); 1231 1232 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1233 error_report_err(local_err); 1234 } 1235 } 1236 1237 static void ram_release_page(const char *rbname, uint64_t offset) 1238 { 1239 if (!migrate_release_ram() || !migration_in_postcopy()) { 1240 return; 1241 } 1242 1243 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1244 } 1245 1246 /** 1247 * save_zero_page_to_file: send the zero page to the file 1248 * 1249 * Returns the size of data written to the file, 0 means the page is not 1250 * a zero page 1251 * 1252 * @rs: current RAM state 1253 * @file: the file where the data is saved 1254 * @block: block that contains the page we want to send 1255 * @offset: offset inside the block for the page 1256 */ 1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1258 RAMBlock *block, ram_addr_t offset) 1259 { 1260 uint8_t *p = block->host + offset; 1261 int len = 0; 1262 1263 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1264 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1265 qemu_put_byte(file, 0); 1266 len += 1; 1267 ram_release_page(block->idstr, offset); 1268 } 1269 return len; 1270 } 1271 1272 /** 1273 * save_zero_page: send the zero page to the stream 1274 * 1275 * Returns the number of pages written. 
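 *
 * (Illustrative note: for a zero page the stream carries just the 8-byte
 * header written by save_page_header() with RAM_SAVE_FLAG_ZERO set, the
 * block idstr when the block changes, and a single 0 byte -- typically
 * 9 bytes per zero page when RAM_SAVE_FLAG_CONTINUE applies.)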
1276 * 1277 * @rs: current RAM state 1278 * @block: block that contains the page we want to send 1279 * @offset: offset inside the block for the page 1280 */ 1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1282 { 1283 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1284 1285 if (len) { 1286 ram_counters.duplicate++; 1287 ram_transferred_add(len); 1288 return 1; 1289 } 1290 return -1; 1291 } 1292 1293 /* 1294 * @pages: the number of pages written by the control path, 1295 * < 0 - error 1296 * > 0 - number of pages written 1297 * 1298 * Return true if the pages has been saved, otherwise false is returned. 1299 */ 1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1301 int *pages) 1302 { 1303 uint64_t bytes_xmit = 0; 1304 int ret; 1305 1306 *pages = -1; 1307 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1308 &bytes_xmit); 1309 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1310 return false; 1311 } 1312 1313 if (bytes_xmit) { 1314 ram_transferred_add(bytes_xmit); 1315 *pages = 1; 1316 } 1317 1318 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1319 return true; 1320 } 1321 1322 if (bytes_xmit > 0) { 1323 ram_counters.normal++; 1324 } else if (bytes_xmit == 0) { 1325 ram_counters.duplicate++; 1326 } 1327 1328 return true; 1329 } 1330 1331 /* 1332 * directly send the page to the stream 1333 * 1334 * Returns the number of pages written. 1335 * 1336 * @rs: current RAM state 1337 * @block: block that contains the page we want to send 1338 * @offset: offset inside the block for the page 1339 * @buf: the page to be sent 1340 * @async: send to page asyncly 1341 */ 1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1343 uint8_t *buf, bool async) 1344 { 1345 ram_transferred_add(save_page_header(rs, rs->f, block, 1346 offset | RAM_SAVE_FLAG_PAGE)); 1347 if (async) { 1348 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1349 migrate_release_ram() && 1350 migration_in_postcopy()); 1351 } else { 1352 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1353 } 1354 ram_transferred_add(TARGET_PAGE_SIZE); 1355 ram_counters.normal++; 1356 return 1; 1357 } 1358 1359 /** 1360 * ram_save_page: send the given page to the stream 1361 * 1362 * Returns the number of pages written. 1363 * < 0 - error 1364 * >=0 - Number of pages written - this might legally be 0 1365 * if xbzrle noticed the page was the same. 
1366 * 1367 * @rs: current RAM state 1368 * @block: block that contains the page we want to send 1369 * @offset: offset inside the block for the page 1370 */ 1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1372 { 1373 int pages = -1; 1374 uint8_t *p; 1375 bool send_async = true; 1376 RAMBlock *block = pss->block; 1377 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1378 ram_addr_t current_addr = block->offset + offset; 1379 1380 p = block->host + offset; 1381 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1382 1383 XBZRLE_cache_lock(); 1384 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1385 pages = save_xbzrle_page(rs, &p, current_addr, block, 1386 offset); 1387 if (!rs->last_stage) { 1388 /* Can't send this cached data async, since the cache page 1389 * might get updated before it gets to the wire 1390 */ 1391 send_async = false; 1392 } 1393 } 1394 1395 /* XBZRLE overflow or normal page */ 1396 if (pages == -1) { 1397 pages = save_normal_page(rs, block, offset, p, send_async); 1398 } 1399 1400 XBZRLE_cache_unlock(); 1401 1402 return pages; 1403 } 1404 1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1406 ram_addr_t offset) 1407 { 1408 if (multifd_queue_page(rs->f, block, offset) < 0) { 1409 return -1; 1410 } 1411 ram_counters.normal++; 1412 1413 return 1; 1414 } 1415 1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1417 ram_addr_t offset, uint8_t *source_buf) 1418 { 1419 RAMState *rs = ram_state; 1420 uint8_t *p = block->host + offset; 1421 int ret; 1422 1423 if (save_zero_page_to_file(rs, f, block, offset)) { 1424 return true; 1425 } 1426 1427 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1428 1429 /* 1430 * copy it to a internal buffer to avoid it being modified by VM 1431 * so that we can catch up the error during compression and 1432 * decompression 1433 */ 1434 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1435 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1436 if (ret < 0) { 1437 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1438 error_report("compressed data failed!"); 1439 } 1440 return false; 1441 } 1442 1443 static void 1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1445 { 1446 ram_transferred_add(bytes_xmit); 1447 1448 if (param->zero_page) { 1449 ram_counters.duplicate++; 1450 return; 1451 } 1452 1453 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1454 compression_counters.compressed_size += bytes_xmit - 8; 1455 compression_counters.pages++; 1456 } 1457 1458 static bool save_page_use_compression(RAMState *rs); 1459 1460 static void flush_compressed_data(RAMState *rs) 1461 { 1462 int idx, len, thread_count; 1463 1464 if (!save_page_use_compression(rs)) { 1465 return; 1466 } 1467 thread_count = migrate_compress_threads(); 1468 1469 qemu_mutex_lock(&comp_done_lock); 1470 for (idx = 0; idx < thread_count; idx++) { 1471 while (!comp_param[idx].done) { 1472 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1473 } 1474 } 1475 qemu_mutex_unlock(&comp_done_lock); 1476 1477 for (idx = 0; idx < thread_count; idx++) { 1478 qemu_mutex_lock(&comp_param[idx].mutex); 1479 if (!comp_param[idx].quit) { 1480 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1481 /* 1482 * it's safe to fetch zero_page without holding comp_done_lock 1483 * as there is no further request submitted to the thread, 1484 * i.e, the thread should be waiting for a request at this point. 
1485 */ 1486 update_compress_thread_counts(&comp_param[idx], len); 1487 } 1488 qemu_mutex_unlock(&comp_param[idx].mutex); 1489 } 1490 } 1491 1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1493 ram_addr_t offset) 1494 { 1495 param->block = block; 1496 param->offset = offset; 1497 } 1498 1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1500 ram_addr_t offset) 1501 { 1502 int idx, thread_count, bytes_xmit = -1, pages = -1; 1503 bool wait = migrate_compress_wait_thread(); 1504 1505 thread_count = migrate_compress_threads(); 1506 qemu_mutex_lock(&comp_done_lock); 1507 retry: 1508 for (idx = 0; idx < thread_count; idx++) { 1509 if (comp_param[idx].done) { 1510 comp_param[idx].done = false; 1511 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1512 qemu_mutex_lock(&comp_param[idx].mutex); 1513 set_compress_params(&comp_param[idx], block, offset); 1514 qemu_cond_signal(&comp_param[idx].cond); 1515 qemu_mutex_unlock(&comp_param[idx].mutex); 1516 pages = 1; 1517 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1518 break; 1519 } 1520 } 1521 1522 /* 1523 * wait for the free thread if the user specifies 'compress-wait-thread', 1524 * otherwise we will post the page out in the main thread as normal page. 1525 */ 1526 if (pages < 0 && wait) { 1527 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1528 goto retry; 1529 } 1530 qemu_mutex_unlock(&comp_done_lock); 1531 1532 return pages; 1533 } 1534 1535 /** 1536 * find_dirty_block: find the next dirty page and update any state 1537 * associated with the search process. 1538 * 1539 * Returns true if a page is found 1540 * 1541 * @rs: current RAM state 1542 * @pss: data about the state of the current dirty page scan 1543 * @again: set to false if the search has scanned the whole of RAM 1544 */ 1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1546 { 1547 /* 1548 * This is not a postcopy requested page, mark it "not urgent", and use 1549 * precopy channel to send it. 1550 */ 1551 pss->postcopy_requested = false; 1552 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; 1553 1554 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1555 if (pss->complete_round && pss->block == rs->last_seen_block && 1556 pss->page >= rs->last_page) { 1557 /* 1558 * We've been once around the RAM and haven't found anything. 1559 * Give up. 1560 */ 1561 *again = false; 1562 return false; 1563 } 1564 if (!offset_in_ramblock(pss->block, 1565 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1566 /* Didn't find anything in this RAM Block */ 1567 pss->page = 0; 1568 pss->block = QLIST_NEXT_RCU(pss->block, next); 1569 if (!pss->block) { 1570 /* 1571 * If memory migration starts over, we will meet a dirtied page 1572 * which may still exists in compression threads's ring, so we 1573 * should flush the compressed data to make sure the new page 1574 * is not overwritten by the old one in the destination. 1575 * 1576 * Also If xbzrle is on, stop using the data compression at this 1577 * point. In theory, xbzrle can do better than compression. 1578 */ 1579 flush_compressed_data(rs); 1580 1581 /* Hit the end of the list */ 1582 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1583 /* Flag that we've looped */ 1584 pss->complete_round = true; 1585 /* After the first round, enable XBZRLE. 
*/ 1586 if (migrate_use_xbzrle()) { 1587 rs->xbzrle_enabled = true; 1588 } 1589 } 1590 /* Didn't find anything this time, but try again on the new block */ 1591 *again = true; 1592 return false; 1593 } else { 1594 /* Can go around again, but... */ 1595 *again = true; 1596 /* We've found something so probably don't need to */ 1597 return true; 1598 } 1599 } 1600 1601 /** 1602 * unqueue_page: gets a page of the queue 1603 * 1604 * Helper for 'get_queued_page' - gets a page off the queue 1605 * 1606 * Returns the block of the page (or NULL if none available) 1607 * 1608 * @rs: current RAM state 1609 * @offset: used to return the offset within the RAMBlock 1610 */ 1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1612 { 1613 struct RAMSrcPageRequest *entry; 1614 RAMBlock *block = NULL; 1615 size_t page_size; 1616 1617 if (!postcopy_has_request(rs)) { 1618 return NULL; 1619 } 1620 1621 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1622 1623 /* 1624 * This should _never_ change even after we take the lock, because no one 1625 * should be taking anything off the request list other than us. 1626 */ 1627 assert(postcopy_has_request(rs)); 1628 1629 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1630 block = entry->rb; 1631 *offset = entry->offset; 1632 page_size = qemu_ram_pagesize(block); 1633 /* Each page request should only be multiple page size of the ramblock */ 1634 assert((entry->len % page_size) == 0); 1635 1636 if (entry->len > page_size) { 1637 entry->len -= page_size; 1638 entry->offset += page_size; 1639 } else { 1640 memory_region_unref(block->mr); 1641 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1642 g_free(entry); 1643 migration_consume_urgent_request(); 1644 } 1645 1646 trace_unqueue_page(block->idstr, *offset, 1647 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap)); 1648 1649 return block; 1650 } 1651 1652 #if defined(__linux__) 1653 /** 1654 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1655 * is found, return RAM block pointer and page offset 1656 * 1657 * Returns pointer to the RAMBlock containing faulting page, 1658 * NULL if no write faults are pending 1659 * 1660 * @rs: current RAM state 1661 * @offset: page offset from the beginning of the block 1662 */ 1663 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1664 { 1665 struct uffd_msg uffd_msg; 1666 void *page_address; 1667 RAMBlock *block; 1668 int res; 1669 1670 if (!migrate_background_snapshot()) { 1671 return NULL; 1672 } 1673 1674 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1675 if (res <= 0) { 1676 return NULL; 1677 } 1678 1679 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1680 block = qemu_ram_block_from_host(page_address, false, offset); 1681 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1682 return block; 1683 } 1684 1685 /** 1686 * ram_save_release_protection: release UFFD write protection after 1687 * a range of pages has been saved 1688 * 1689 * @rs: current RAM state 1690 * @pss: page-search-status structure 1691 * @start_page: index of the first page in the range relative to pss->block 1692 * 1693 * Returns 0 on success, negative value in case of an error 1694 */ 1695 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1696 unsigned long start_page) 1697 { 1698 int res = 0; 1699 1700 /* Check if page is from UFFD-managed region. 
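 *
 * (Illustrative example, assuming 4 KiB target pages: after saving pages
 * 16..23 of a write-protected block, start_page = 16 and pss->page = 24,
 * so run_length = 8 << TARGET_PAGE_BITS = 32 KiB is un-protected in a
 * single uffd_change_protection() call below.)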
*/ 1701 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1702 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1703 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1704 1705 /* Flush async buffers before un-protect. */ 1706 qemu_fflush(rs->f); 1707 /* Un-protect memory range. */ 1708 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1709 false, false); 1710 } 1711 1712 return res; 1713 } 1714 1715 /* ram_write_tracking_available: check if kernel supports required UFFD features 1716 * 1717 * Returns true if supports, false otherwise 1718 */ 1719 bool ram_write_tracking_available(void) 1720 { 1721 uint64_t uffd_features; 1722 int res; 1723 1724 res = uffd_query_features(&uffd_features); 1725 return (res == 0 && 1726 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1727 } 1728 1729 /* ram_write_tracking_compatible: check if guest configuration is 1730 * compatible with 'write-tracking' 1731 * 1732 * Returns true if compatible, false otherwise 1733 */ 1734 bool ram_write_tracking_compatible(void) 1735 { 1736 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1737 int uffd_fd; 1738 RAMBlock *block; 1739 bool ret = false; 1740 1741 /* Open UFFD file descriptor */ 1742 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1743 if (uffd_fd < 0) { 1744 return false; 1745 } 1746 1747 RCU_READ_LOCK_GUARD(); 1748 1749 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1750 uint64_t uffd_ioctls; 1751 1752 /* Nothing to do with read-only and MMIO-writable regions */ 1753 if (block->mr->readonly || block->mr->rom_device) { 1754 continue; 1755 } 1756 /* Try to register block memory via UFFD-IO to track writes */ 1757 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1758 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1759 goto out; 1760 } 1761 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1762 goto out; 1763 } 1764 } 1765 ret = true; 1766 1767 out: 1768 uffd_close_fd(uffd_fd); 1769 return ret; 1770 } 1771 1772 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1773 ram_addr_t size) 1774 { 1775 /* 1776 * We read one byte of each page; this will preallocate page tables if 1777 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1778 * where no page was populated yet. This might require adaption when 1779 * supporting other mappings, like shmem. 1780 */ 1781 for (; offset < size; offset += block->page_size) { 1782 char tmp = *((char *)block->host + offset); 1783 1784 /* Don't optimize the read out */ 1785 asm volatile("" : "+r" (tmp)); 1786 } 1787 } 1788 1789 static inline int populate_read_section(MemoryRegionSection *section, 1790 void *opaque) 1791 { 1792 const hwaddr size = int128_get64(section->size); 1793 hwaddr offset = section->offset_within_region; 1794 RAMBlock *block = section->mr->ram_block; 1795 1796 populate_read_range(block, offset, size); 1797 return 0; 1798 } 1799 1800 /* 1801 * ram_block_populate_read: preallocate page tables and populate pages in the 1802 * RAM block by reading a byte of each page. 1803 * 1804 * Since it's solely used for userfault_fd WP feature, here we just 1805 * hardcode page size to qemu_real_host_page_size. 1806 * 1807 * @block: RAM block to populate 1808 */ 1809 static void ram_block_populate_read(RAMBlock *rb) 1810 { 1811 /* 1812 * Skip populating all pages that fall into a discarded range as managed by 1813 * a RamDiscardManager responsible for the mapped memory region of the 1814 * RAMBlock. 
Such discarded ("logically unplugged") parts of a RAMBlock 1815 * must not get populated automatically. We don't have to track 1816 * modifications via userfaultfd WP reliably, because these pages will 1817 * not be part of the migration stream either way -- see 1818 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1819 * 1820 * Note: The result is only stable while migrating (precopy/postcopy). 1821 */ 1822 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1823 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1824 MemoryRegionSection section = { 1825 .mr = rb->mr, 1826 .offset_within_region = 0, 1827 .size = rb->mr->size, 1828 }; 1829 1830 ram_discard_manager_replay_populated(rdm, §ion, 1831 populate_read_section, NULL); 1832 } else { 1833 populate_read_range(rb, 0, rb->used_length); 1834 } 1835 } 1836 1837 /* 1838 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1839 */ 1840 void ram_write_tracking_prepare(void) 1841 { 1842 RAMBlock *block; 1843 1844 RCU_READ_LOCK_GUARD(); 1845 1846 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1847 /* Nothing to do with read-only and MMIO-writable regions */ 1848 if (block->mr->readonly || block->mr->rom_device) { 1849 continue; 1850 } 1851 1852 /* 1853 * Populate pages of the RAM block before enabling userfault_fd 1854 * write protection. 1855 * 1856 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1857 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1858 * pages with pte_none() entries in page table. 1859 */ 1860 ram_block_populate_read(block); 1861 } 1862 } 1863 1864 /* 1865 * ram_write_tracking_start: start UFFD-WP memory tracking 1866 * 1867 * Returns 0 for success or negative value in case of error 1868 */ 1869 int ram_write_tracking_start(void) 1870 { 1871 int uffd_fd; 1872 RAMState *rs = ram_state; 1873 RAMBlock *block; 1874 1875 /* Open UFFD file descriptor */ 1876 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1877 if (uffd_fd < 0) { 1878 return uffd_fd; 1879 } 1880 rs->uffdio_fd = uffd_fd; 1881 1882 RCU_READ_LOCK_GUARD(); 1883 1884 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1885 /* Nothing to do with read-only and MMIO-writable regions */ 1886 if (block->mr->readonly || block->mr->rom_device) { 1887 continue; 1888 } 1889 1890 /* Register block memory with UFFD to track writes */ 1891 if (uffd_register_memory(rs->uffdio_fd, block->host, 1892 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1893 goto fail; 1894 } 1895 /* Apply UFFD write protection to the block memory range */ 1896 if (uffd_change_protection(rs->uffdio_fd, block->host, 1897 block->max_length, true, false)) { 1898 goto fail; 1899 } 1900 block->flags |= RAM_UF_WRITEPROTECT; 1901 memory_region_ref(block->mr); 1902 1903 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1904 block->host, block->max_length); 1905 } 1906 1907 return 0; 1908 1909 fail: 1910 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1911 1912 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1913 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1914 continue; 1915 } 1916 /* 1917 * In case some memory block failed to be write-protected 1918 * remove protection and unregister all succeeded RAM blocks 1919 */ 1920 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1921 false, false); 1922 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1923 /* Cleanup flags and remove reference */ 1924 block->flags &= ~RAM_UF_WRITEPROTECT; 1925 
memory_region_unref(block->mr); 1926 } 1927 1928 uffd_close_fd(uffd_fd); 1929 rs->uffdio_fd = -1; 1930 return -1; 1931 } 1932 1933 /** 1934 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1935 */ 1936 void ram_write_tracking_stop(void) 1937 { 1938 RAMState *rs = ram_state; 1939 RAMBlock *block; 1940 1941 RCU_READ_LOCK_GUARD(); 1942 1943 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1944 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1945 continue; 1946 } 1947 /* Remove protection and unregister all affected RAM blocks */ 1948 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1949 false, false); 1950 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1951 1952 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1953 block->host, block->max_length); 1954 1955 /* Cleanup flags and remove reference */ 1956 block->flags &= ~RAM_UF_WRITEPROTECT; 1957 memory_region_unref(block->mr); 1958 } 1959 1960 /* Finally close UFFD file descriptor */ 1961 uffd_close_fd(rs->uffdio_fd); 1962 rs->uffdio_fd = -1; 1963 } 1964 1965 #else 1966 /* No target OS support, stubs just fail or ignore */ 1967 1968 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1969 { 1970 (void) rs; 1971 (void) offset; 1972 1973 return NULL; 1974 } 1975 1976 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1977 unsigned long start_page) 1978 { 1979 (void) rs; 1980 (void) pss; 1981 (void) start_page; 1982 1983 return 0; 1984 } 1985 1986 bool ram_write_tracking_available(void) 1987 { 1988 return false; 1989 } 1990 1991 bool ram_write_tracking_compatible(void) 1992 { 1993 assert(0); 1994 return false; 1995 } 1996 1997 int ram_write_tracking_start(void) 1998 { 1999 assert(0); 2000 return -1; 2001 } 2002 2003 void ram_write_tracking_stop(void) 2004 { 2005 assert(0); 2006 } 2007 #endif /* defined(__linux__) */ 2008 2009 /* 2010 * Check whether two addr/offset of the ramblock falls onto the same host huge 2011 * page. Returns true if so, false otherwise. 2012 */ 2013 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1, 2014 uint64_t addr2) 2015 { 2016 size_t page_size = qemu_ram_pagesize(rb); 2017 2018 addr1 = ROUND_DOWN(addr1, page_size); 2019 addr2 = ROUND_DOWN(addr2, page_size); 2020 2021 return addr1 == addr2; 2022 } 2023 2024 /* 2025 * Whether a previous preempted precopy huge page contains current requested 2026 * page? Returns true if so, false otherwise. 2027 * 2028 * This should really happen very rarely, because it means when we were sending 2029 * during background migration for postcopy we're sending exactly the page that 2030 * some vcpu got faulted on on dest node. When it happens, we probably don't 2031 * need to do much but drop the request, because we know right after we restore 2032 * the precopy stream it'll be serviced. It'll slightly affect the order of 2033 * postcopy requests to be serviced (e.g. it'll be the same as we move current 2034 * request to the end of the queue) but it shouldn't be a big deal. The most 2035 * imporant thing is we can _never_ try to send a partial-sent huge page on the 2036 * POSTCOPY channel again, otherwise that huge page will got "split brain" on 2037 * two channels (PRECOPY, POSTCOPY). 2038 */ 2039 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block, 2040 ram_addr_t offset) 2041 { 2042 PostcopyPreemptState *state = &rs->postcopy_preempt_state; 2043 2044 /* No preemption at all? 
*/ 2045 if (!state->preempted) { 2046 return false; 2047 } 2048 2049 /* Not even the same ramblock? */ 2050 if (state->ram_block != block) { 2051 return false; 2052 } 2053 2054 return offset_on_same_huge_page(block, offset, 2055 state->ram_page << TARGET_PAGE_BITS); 2056 } 2057 2058 /** 2059 * get_queued_page: unqueue a page from the postcopy requests 2060 * 2061 * Skips pages that are already sent (!dirty) 2062 * 2063 * Returns true if a queued page is found 2064 * 2065 * @rs: current RAM state 2066 * @pss: data about the state of the current dirty page scan 2067 */ 2068 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2069 { 2070 RAMBlock *block; 2071 ram_addr_t offset; 2072 2073 block = unqueue_page(rs, &offset); 2074 2075 if (block) { 2076 /* See comment above postcopy_preempted_contains() */ 2077 if (postcopy_preempted_contains(rs, block, offset)) { 2078 trace_postcopy_preempt_hit(block->idstr, offset); 2079 /* 2080 * If what we preempted previously was exactly what we're 2081 * requesting right now, restore the preempted precopy 2082 * immediately, boosting its priority as it's requested by 2083 * postcopy. 2084 */ 2085 postcopy_preempt_restore(rs, pss, true); 2086 return true; 2087 } 2088 } else { 2089 /* 2090 * Poll write faults too if background snapshot is enabled; that's 2091 * when we have vcpus got blocked by the write protected pages. 2092 */ 2093 block = poll_fault_page(rs, &offset); 2094 } 2095 2096 if (block) { 2097 /* 2098 * We want the background search to continue from the queued page 2099 * since the guest is likely to want other pages near to the page 2100 * it just requested. 2101 */ 2102 pss->block = block; 2103 pss->page = offset >> TARGET_PAGE_BITS; 2104 2105 /* 2106 * This unqueued page would break the "one round" check, even is 2107 * really rare. 2108 */ 2109 pss->complete_round = false; 2110 /* Mark it an urgent request, meanwhile using POSTCOPY channel */ 2111 pss->postcopy_requested = true; 2112 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY; 2113 } 2114 2115 return !!block; 2116 } 2117 2118 /** 2119 * migration_page_queue_free: drop any remaining pages in the ram 2120 * request queue 2121 * 2122 * It should be empty at the end anyway, but in error cases there may 2123 * be some left. in case that there is any page left, we drop it. 2124 * 2125 */ 2126 static void migration_page_queue_free(RAMState *rs) 2127 { 2128 struct RAMSrcPageRequest *mspr, *next_mspr; 2129 /* This queue generally should be empty - but in the case of a failed 2130 * migration might have some droppings in. 2131 */ 2132 RCU_READ_LOCK_GUARD(); 2133 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2134 memory_region_unref(mspr->rb->mr); 2135 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2136 g_free(mspr); 2137 } 2138 } 2139 2140 /** 2141 * ram_save_queue_pages: queue the page for transmission 2142 * 2143 * A request from postcopy destination for example. 2144 * 2145 * Returns zero on success or negative on error 2146 * 2147 * @rbname: Name of the RAMBLock of the request. NULL means the 2148 * same that last one. 
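* (i.e. rs->last_req_rb, the RAMBlock of the previous request, is reused)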
2149 * @start: starting address from the start of the RAMBlock 2150 * @len: length (in bytes) to send 2151 */ 2152 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2153 { 2154 RAMBlock *ramblock; 2155 RAMState *rs = ram_state; 2156 2157 ram_counters.postcopy_requests++; 2158 RCU_READ_LOCK_GUARD(); 2159 2160 if (!rbname) { 2161 /* Reuse last RAMBlock */ 2162 ramblock = rs->last_req_rb; 2163 2164 if (!ramblock) { 2165 /* 2166 * Shouldn't happen, we can't reuse the last RAMBlock if 2167 * it's the 1st request. 2168 */ 2169 error_report("ram_save_queue_pages no previous block"); 2170 return -1; 2171 } 2172 } else { 2173 ramblock = qemu_ram_block_by_name(rbname); 2174 2175 if (!ramblock) { 2176 /* We shouldn't be asked for a non-existent RAMBlock */ 2177 error_report("ram_save_queue_pages no block '%s'", rbname); 2178 return -1; 2179 } 2180 rs->last_req_rb = ramblock; 2181 } 2182 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2183 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2184 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2185 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2186 __func__, start, len, ramblock->used_length); 2187 return -1; 2188 } 2189 2190 struct RAMSrcPageRequest *new_entry = 2191 g_new0(struct RAMSrcPageRequest, 1); 2192 new_entry->rb = ramblock; 2193 new_entry->offset = start; 2194 new_entry->len = len; 2195 2196 memory_region_ref(ramblock->mr); 2197 qemu_mutex_lock(&rs->src_page_req_mutex); 2198 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2199 migration_make_urgent_request(); 2200 qemu_mutex_unlock(&rs->src_page_req_mutex); 2201 2202 return 0; 2203 } 2204 2205 static bool save_page_use_compression(RAMState *rs) 2206 { 2207 if (!migrate_use_compression()) { 2208 return false; 2209 } 2210 2211 /* 2212 * If xbzrle is enabled (e.g., after first round of migration), stop 2213 * using the data compression. In theory, xbzrle can do better than 2214 * compression. 2215 */ 2216 if (rs->xbzrle_enabled) { 2217 return false; 2218 } 2219 2220 return true; 2221 } 2222 2223 /* 2224 * try to compress the page before posting it out, return true if the page 2225 * has been properly handled by compression, otherwise needs other 2226 * paths to handle it 2227 */ 2228 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2229 { 2230 if (!save_page_use_compression(rs)) { 2231 return false; 2232 } 2233 2234 /* 2235 * When starting the process of a new block, the first page of 2236 * the block should be sent out before other pages in the same 2237 * block, and all the pages in last block should have been sent 2238 * out, keeping this order is important, because the 'cont' flag 2239 * is used to avoid resending the block name. 2240 * 2241 * We post the fist page as normal page as compression will take 2242 * much CPU resource. 
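* That is what the block != rs->last_sent_block check below does: on a block
* switch the pending compressed data is flushed and the first page falls
* through to the normal send path, so the block header is not delayed behind
* the compression threads.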
2243 */ 2244 if (block != rs->last_sent_block) { 2245 flush_compressed_data(rs); 2246 return false; 2247 } 2248 2249 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2250 return true; 2251 } 2252 2253 compression_counters.busy++; 2254 return false; 2255 } 2256 2257 /** 2258 * ram_save_target_page: save one target page 2259 * 2260 * Returns the number of pages written 2261 * 2262 * @rs: current RAM state 2263 * @pss: data about the page we want to send 2264 */ 2265 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2266 { 2267 RAMBlock *block = pss->block; 2268 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2269 int res; 2270 2271 if (control_save_page(rs, block, offset, &res)) { 2272 return res; 2273 } 2274 2275 if (save_compress_page(rs, block, offset)) { 2276 return 1; 2277 } 2278 2279 res = save_zero_page(rs, block, offset); 2280 if (res > 0) { 2281 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2282 * page would be stale 2283 */ 2284 if (!save_page_use_compression(rs)) { 2285 XBZRLE_cache_lock(); 2286 xbzrle_cache_zero_page(rs, block->offset + offset); 2287 XBZRLE_cache_unlock(); 2288 } 2289 return res; 2290 } 2291 2292 /* 2293 * Do not use multifd for: 2294 * 1. Compression as the first page in the new block should be posted out 2295 * before sending the compressed page 2296 * 2. In postcopy as one whole host page should be placed 2297 */ 2298 if (!save_page_use_compression(rs) && migrate_use_multifd() 2299 && !migration_in_postcopy()) { 2300 return ram_save_multifd_page(rs, block, offset); 2301 } 2302 2303 return ram_save_page(rs, pss); 2304 } 2305 2306 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss) 2307 { 2308 MigrationState *ms = migrate_get_current(); 2309 2310 /* Not enabled eager preempt? Then never do that. */ 2311 if (!migrate_postcopy_preempt()) { 2312 return false; 2313 } 2314 2315 /* If the user explicitly disabled breaking of huge page, skip */ 2316 if (!ms->postcopy_preempt_break_huge) { 2317 return false; 2318 } 2319 2320 /* If the ramblock we're sending is a small page? Never bother. */ 2321 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) { 2322 return false; 2323 } 2324 2325 /* Not in postcopy at all? */ 2326 if (!migration_in_postcopy()) { 2327 return false; 2328 } 2329 2330 /* 2331 * If we're already handling a postcopy request, don't preempt as this page 2332 * has got the same high priority. 2333 */ 2334 if (pss->postcopy_requested) { 2335 return false; 2336 } 2337 2338 /* If there's postcopy requests, then check it up! */ 2339 return postcopy_has_request(rs); 2340 } 2341 2342 /* Returns true if we preempted precopy, false otherwise */ 2343 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss) 2344 { 2345 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state; 2346 2347 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page); 2348 2349 /* 2350 * Time to preempt precopy. Cache current PSS into preempt state, so that 2351 * after handling the postcopy pages we can recover to it. We need to do 2352 * so because the dest VM will have partial of the precopy huge page kept 2353 * over in its tmp huge page caches; better move on with it when we can. 
2354 */ 2355 p_state->ram_block = pss->block; 2356 p_state->ram_page = pss->page; 2357 p_state->preempted = true; 2358 } 2359 2360 /* Whether we're preempted by a postcopy request during sending a huge page */ 2361 static bool postcopy_preempt_triggered(RAMState *rs) 2362 { 2363 return rs->postcopy_preempt_state.preempted; 2364 } 2365 2366 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, 2367 bool postcopy_requested) 2368 { 2369 PostcopyPreemptState *state = &rs->postcopy_preempt_state; 2370 2371 assert(state->preempted); 2372 2373 pss->block = state->ram_block; 2374 pss->page = state->ram_page; 2375 2376 /* Whether this is a postcopy request? */ 2377 pss->postcopy_requested = postcopy_requested; 2378 /* 2379 * When restoring a preempted page, the old data resides in PRECOPY 2380 * slow channel, even if postcopy_requested is set. So always use 2381 * PRECOPY channel here. 2382 */ 2383 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; 2384 2385 trace_postcopy_preempt_restored(pss->block->idstr, pss->page); 2386 2387 /* Reset preempt state, most importantly, set preempted==false */ 2388 postcopy_preempt_reset(rs); 2389 } 2390 2391 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss) 2392 { 2393 MigrationState *s = migrate_get_current(); 2394 unsigned int channel = pss->postcopy_target_channel; 2395 QEMUFile *next; 2396 2397 if (channel != rs->postcopy_channel) { 2398 if (channel == RAM_CHANNEL_PRECOPY) { 2399 next = s->to_dst_file; 2400 } else { 2401 next = s->postcopy_qemufile_src; 2402 } 2403 /* Update and cache the current channel */ 2404 rs->f = next; 2405 rs->postcopy_channel = channel; 2406 2407 /* 2408 * If channel switched, reset last_sent_block since the old sent block 2409 * may not be on the same channel. 2410 */ 2411 rs->last_sent_block = NULL; 2412 2413 trace_postcopy_preempt_switch_channel(channel); 2414 } 2415 2416 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2417 } 2418 2419 /* We need to make sure rs->f always points to the default channel elsewhere */ 2420 static void postcopy_preempt_reset_channel(RAMState *rs) 2421 { 2422 if (migrate_postcopy_preempt() && migration_in_postcopy()) { 2423 rs->postcopy_channel = RAM_CHANNEL_PRECOPY; 2424 rs->f = migrate_get_current()->to_dst_file; 2425 trace_postcopy_preempt_reset_channel(); 2426 } 2427 } 2428 2429 /** 2430 * ram_save_host_page: save a whole host page 2431 * 2432 * Starting at *offset send pages up to the end of the current host 2433 * page. It's valid for the initial offset to point into the middle of 2434 * a host page in which case the remainder of the hostpage is sent. 2435 * Only dirty target pages are sent. Note that the host page size may 2436 * be a huge page for this block. 2437 * The saving stops at the boundary of the used_length of the block 2438 * if the RAMBlock isn't a multiple of the host page size. 
2439 * 2440 * Returns the number of pages written or negative on error 2441 * 2442 * @rs: current RAM state 2443 * @pss: data about the page we want to send 2444 */ 2445 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2446 { 2447 int tmppages, pages = 0; 2448 size_t pagesize_bits = 2449 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2450 unsigned long hostpage_boundary = 2451 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2452 unsigned long start_page = pss->page; 2453 int res; 2454 2455 if (ramblock_is_ignored(pss->block)) { 2456 error_report("block %s should not be migrated !", pss->block->idstr); 2457 return 0; 2458 } 2459 2460 if (migrate_postcopy_preempt() && migration_in_postcopy()) { 2461 postcopy_preempt_choose_channel(rs, pss); 2462 } 2463 2464 do { 2465 if (postcopy_needs_preempt(rs, pss)) { 2466 postcopy_do_preempt(rs, pss); 2467 break; 2468 } 2469 2470 /* Check the pages is dirty and if it is send it */ 2471 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2472 tmppages = ram_save_target_page(rs, pss); 2473 if (tmppages < 0) { 2474 return tmppages; 2475 } 2476 2477 pages += tmppages; 2478 /* 2479 * Allow rate limiting to happen in the middle of huge pages if 2480 * something is sent in the current iteration. 2481 */ 2482 if (pagesize_bits > 1 && tmppages > 0) { 2483 migration_rate_limit(); 2484 } 2485 } 2486 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2487 } while ((pss->page < hostpage_boundary) && 2488 offset_in_ramblock(pss->block, 2489 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2490 /* The offset we leave with is the min boundary of host page and block */ 2491 pss->page = MIN(pss->page, hostpage_boundary); 2492 2493 /* 2494 * When with postcopy preempt mode, flush the data as soon as possible for 2495 * postcopy requests, because we've already sent a whole huge page, so the 2496 * dst node should already have enough resource to atomically filling in 2497 * the current missing page. 2498 * 2499 * More importantly, when using separate postcopy channel, we must do 2500 * explicit flush or it won't flush until the buffer is full. 2501 */ 2502 if (migrate_postcopy_preempt() && pss->postcopy_requested) { 2503 qemu_fflush(rs->f); 2504 } 2505 2506 res = ram_save_release_protection(rs, pss, start_page); 2507 return (res < 0 ? res : pages); 2508 } 2509 2510 /** 2511 * ram_find_and_save_block: finds a dirty page and sends it to f 2512 * 2513 * Called within an RCU critical section. 2514 * 2515 * Returns the number of pages written where zero means no dirty pages, 2516 * or negative on error 2517 * 2518 * @rs: current RAM state 2519 * 2520 * On systems where host-page-size > target-page-size it will send all the 2521 * pages in a host page that are dirty. 2522 */ 2523 static int ram_find_and_save_block(RAMState *rs) 2524 { 2525 PageSearchStatus pss; 2526 int pages = 0; 2527 bool again, found; 2528 2529 /* No dirty page as there is zero RAM */ 2530 if (!ram_bytes_total()) { 2531 return pages; 2532 } 2533 2534 pss.block = rs->last_seen_block; 2535 pss.page = rs->last_page; 2536 pss.complete_round = false; 2537 2538 if (!pss.block) { 2539 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2540 } 2541 2542 do { 2543 again = true; 2544 found = get_queued_page(rs, &pss); 2545 2546 if (!found) { 2547 /* 2548 * Recover previous precopy ramblock/offset if postcopy has 2549 * preempted precopy. Otherwise find the next dirty bit. 
2550 */ 2551 if (postcopy_preempt_triggered(rs)) { 2552 postcopy_preempt_restore(rs, &pss, false); 2553 found = true; 2554 } else { 2555 /* priority queue empty, so just search for something dirty */ 2556 found = find_dirty_block(rs, &pss, &again); 2557 } 2558 } 2559 2560 if (found) { 2561 pages = ram_save_host_page(rs, &pss); 2562 } 2563 } while (!pages && again); 2564 2565 rs->last_seen_block = pss.block; 2566 rs->last_page = pss.page; 2567 2568 return pages; 2569 } 2570 2571 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2572 { 2573 uint64_t pages = size / TARGET_PAGE_SIZE; 2574 2575 if (zero) { 2576 ram_counters.duplicate += pages; 2577 } else { 2578 ram_counters.normal += pages; 2579 ram_transferred_add(size); 2580 qemu_file_credit_transfer(f, size); 2581 } 2582 } 2583 2584 static uint64_t ram_bytes_total_common(bool count_ignored) 2585 { 2586 RAMBlock *block; 2587 uint64_t total = 0; 2588 2589 RCU_READ_LOCK_GUARD(); 2590 2591 if (count_ignored) { 2592 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2593 total += block->used_length; 2594 } 2595 } else { 2596 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2597 total += block->used_length; 2598 } 2599 } 2600 return total; 2601 } 2602 2603 uint64_t ram_bytes_total(void) 2604 { 2605 return ram_bytes_total_common(false); 2606 } 2607 2608 static void xbzrle_load_setup(void) 2609 { 2610 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2611 } 2612 2613 static void xbzrle_load_cleanup(void) 2614 { 2615 g_free(XBZRLE.decoded_buf); 2616 XBZRLE.decoded_buf = NULL; 2617 } 2618 2619 static void ram_state_cleanup(RAMState **rsp) 2620 { 2621 if (*rsp) { 2622 migration_page_queue_free(*rsp); 2623 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2624 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2625 g_free(*rsp); 2626 *rsp = NULL; 2627 } 2628 } 2629 2630 static void xbzrle_cleanup(void) 2631 { 2632 XBZRLE_cache_lock(); 2633 if (XBZRLE.cache) { 2634 cache_fini(XBZRLE.cache); 2635 g_free(XBZRLE.encoded_buf); 2636 g_free(XBZRLE.current_buf); 2637 g_free(XBZRLE.zero_target_page); 2638 XBZRLE.cache = NULL; 2639 XBZRLE.encoded_buf = NULL; 2640 XBZRLE.current_buf = NULL; 2641 XBZRLE.zero_target_page = NULL; 2642 } 2643 XBZRLE_cache_unlock(); 2644 } 2645 2646 static void ram_save_cleanup(void *opaque) 2647 { 2648 RAMState **rsp = opaque; 2649 RAMBlock *block; 2650 2651 /* We don't use dirty log with background snapshots */ 2652 if (!migrate_background_snapshot()) { 2653 /* caller have hold iothread lock or is in a bh, so there is 2654 * no writing race against the migration bitmap 2655 */ 2656 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2657 /* 2658 * do not stop dirty log without starting it, since 2659 * memory_global_dirty_log_stop will assert that 2660 * memory_global_dirty_log_start/stop used in pairs 2661 */ 2662 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2663 } 2664 } 2665 2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2667 g_free(block->clear_bmap); 2668 block->clear_bmap = NULL; 2669 g_free(block->bmap); 2670 block->bmap = NULL; 2671 } 2672 2673 xbzrle_cleanup(); 2674 compress_threads_save_cleanup(); 2675 ram_state_cleanup(rsp); 2676 } 2677 2678 static void ram_state_reset(RAMState *rs) 2679 { 2680 rs->last_seen_block = NULL; 2681 rs->last_sent_block = NULL; 2682 rs->last_page = 0; 2683 rs->last_version = ram_list.version; 2684 rs->xbzrle_enabled = false; 2685 postcopy_preempt_reset(rs); 2686 rs->postcopy_channel = RAM_CHANNEL_PRECOPY; 2687 } 2688 2689 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2690 2691 /* **** functions for 
postcopy ***** */ 2692 2693 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2694 { 2695 struct RAMBlock *block; 2696 2697 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2698 unsigned long *bitmap = block->bmap; 2699 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2700 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2701 2702 while (run_start < range) { 2703 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2704 ram_discard_range(block->idstr, 2705 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2706 ((ram_addr_t)(run_end - run_start)) 2707 << TARGET_PAGE_BITS); 2708 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2709 } 2710 } 2711 } 2712 2713 /** 2714 * postcopy_send_discard_bm_ram: discard a RAMBlock 2715 * 2716 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2717 * 2718 * @ms: current migration state 2719 * @block: RAMBlock to discard 2720 */ 2721 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2722 { 2723 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2724 unsigned long current; 2725 unsigned long *bitmap = block->bmap; 2726 2727 for (current = 0; current < end; ) { 2728 unsigned long one = find_next_bit(bitmap, end, current); 2729 unsigned long zero, discard_length; 2730 2731 if (one >= end) { 2732 break; 2733 } 2734 2735 zero = find_next_zero_bit(bitmap, end, one + 1); 2736 2737 if (zero >= end) { 2738 discard_length = end - one; 2739 } else { 2740 discard_length = zero - one; 2741 } 2742 postcopy_discard_send_range(ms, one, discard_length); 2743 current = one + discard_length; 2744 } 2745 } 2746 2747 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2748 2749 /** 2750 * postcopy_each_ram_send_discard: discard all RAMBlocks 2751 * 2752 * Utility for the outgoing postcopy code. 2753 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2754 * passing it bitmap indexes and name. 2755 * (qemu_ram_foreach_block ends up passing unscaled lengths 2756 * which would mean postcopy code would have to deal with target page) 2757 * 2758 * @ms: current migration state 2759 */ 2760 static void postcopy_each_ram_send_discard(MigrationState *ms) 2761 { 2762 struct RAMBlock *block; 2763 2764 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2765 postcopy_discard_send_init(ms, block->idstr); 2766 2767 /* 2768 * Deal with TPS != HPS and huge pages. It discard any partially sent 2769 * host-page size chunks, mark any partially dirty host-page size 2770 * chunks as all dirty. In this case the host-page is the host-page 2771 * for the particular RAMBlock, i.e. it might be a huge page. 2772 */ 2773 postcopy_chunk_hostpages_pass(ms, block); 2774 2775 /* 2776 * Postcopy sends chunks of bitmap over the wire, but it 2777 * just needs indexes at this point, avoids it having 2778 * target page specific code. 2779 */ 2780 postcopy_send_discard_bm_ram(ms, block); 2781 postcopy_discard_send_finish(ms); 2782 } 2783 } 2784 2785 /** 2786 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2787 * 2788 * Helper for postcopy_chunk_hostpages; it's called twice to 2789 * canonicalize the two bitmaps, that are similar, but one is 2790 * inverted. 2791 * 2792 * Postcopy requires that all target pages in a hostpage are dirty or 2793 * clean, not a mix. This function canonicalizes the bitmaps. 
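* For example, with 2 MiB host pages and 4 KiB target pages (host_ratio of
* 512), a host page that is only partially dirty gets all 512 of its bits set
* dirty again, so the whole host page is discarded and resent.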
2794 * 2795 * @ms: current migration state 2796 * @block: block that contains the page we want to canonicalize 2797 */ 2798 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2799 { 2800 RAMState *rs = ram_state; 2801 unsigned long *bitmap = block->bmap; 2802 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2803 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2804 unsigned long run_start; 2805 2806 if (block->page_size == TARGET_PAGE_SIZE) { 2807 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2808 return; 2809 } 2810 2811 /* Find a dirty page */ 2812 run_start = find_next_bit(bitmap, pages, 0); 2813 2814 while (run_start < pages) { 2815 2816 /* 2817 * If the start of this run of pages is in the middle of a host 2818 * page, then we need to fixup this host page. 2819 */ 2820 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2821 /* Find the end of this run */ 2822 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2823 /* 2824 * If the end isn't at the start of a host page, then the 2825 * run doesn't finish at the end of a host page 2826 * and we need to discard. 2827 */ 2828 } 2829 2830 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2831 unsigned long page; 2832 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2833 host_ratio); 2834 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2835 2836 /* Clean up the bitmap */ 2837 for (page = fixup_start_addr; 2838 page < fixup_start_addr + host_ratio; page++) { 2839 /* 2840 * Remark them as dirty, updating the count for any pages 2841 * that weren't previously dirty. 2842 */ 2843 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2844 } 2845 } 2846 2847 /* Find the next dirty page for the next iteration */ 2848 run_start = find_next_bit(bitmap, pages, run_start); 2849 } 2850 } 2851 2852 /** 2853 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2854 * 2855 * Transmit the set of pages to be discarded after precopy to the target 2856 * these are pages that: 2857 * a) Have been previously transmitted but are now dirty again 2858 * b) Pages that have never been transmitted, this ensures that 2859 * any pages on the destination that have been mapped by background 2860 * tasks get discarded (transparent huge pages is the specific concern) 2861 * Hopefully this is pretty sparse 2862 * 2863 * @ms: current migration state 2864 */ 2865 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2866 { 2867 RAMState *rs = ram_state; 2868 2869 RCU_READ_LOCK_GUARD(); 2870 2871 /* This should be our last sync, the src is now paused */ 2872 migration_bitmap_sync(rs); 2873 2874 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2875 rs->last_seen_block = NULL; 2876 rs->last_sent_block = NULL; 2877 rs->last_page = 0; 2878 2879 postcopy_each_ram_send_discard(ms); 2880 2881 trace_ram_postcopy_send_discard_bitmap(); 2882 } 2883 2884 /** 2885 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2886 * 2887 * Returns zero on success 2888 * 2889 * @rbname: name of the RAMBlock of the request. NULL means the 2890 * same that last one. 
2891 * @start: RAMBlock starting page 2892 * @length: RAMBlock size 2893 */ 2894 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2895 { 2896 trace_ram_discard_range(rbname, start, length); 2897 2898 RCU_READ_LOCK_GUARD(); 2899 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2900 2901 if (!rb) { 2902 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2903 return -1; 2904 } 2905 2906 /* 2907 * On source VM, we don't need to update the received bitmap since 2908 * we don't even have one. 2909 */ 2910 if (rb->receivedmap) { 2911 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2912 length >> qemu_target_page_bits()); 2913 } 2914 2915 return ram_block_discard_range(rb, start, length); 2916 } 2917 2918 /* 2919 * For every allocation, we will try not to crash the VM if the 2920 * allocation failed. 2921 */ 2922 static int xbzrle_init(void) 2923 { 2924 Error *local_err = NULL; 2925 2926 if (!migrate_use_xbzrle()) { 2927 return 0; 2928 } 2929 2930 XBZRLE_cache_lock(); 2931 2932 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2933 if (!XBZRLE.zero_target_page) { 2934 error_report("%s: Error allocating zero page", __func__); 2935 goto err_out; 2936 } 2937 2938 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2939 TARGET_PAGE_SIZE, &local_err); 2940 if (!XBZRLE.cache) { 2941 error_report_err(local_err); 2942 goto free_zero_page; 2943 } 2944 2945 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2946 if (!XBZRLE.encoded_buf) { 2947 error_report("%s: Error allocating encoded_buf", __func__); 2948 goto free_cache; 2949 } 2950 2951 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2952 if (!XBZRLE.current_buf) { 2953 error_report("%s: Error allocating current_buf", __func__); 2954 goto free_encoded_buf; 2955 } 2956 2957 /* We are all good */ 2958 XBZRLE_cache_unlock(); 2959 return 0; 2960 2961 free_encoded_buf: 2962 g_free(XBZRLE.encoded_buf); 2963 XBZRLE.encoded_buf = NULL; 2964 free_cache: 2965 cache_fini(XBZRLE.cache); 2966 XBZRLE.cache = NULL; 2967 free_zero_page: 2968 g_free(XBZRLE.zero_target_page); 2969 XBZRLE.zero_target_page = NULL; 2970 err_out: 2971 XBZRLE_cache_unlock(); 2972 return -ENOMEM; 2973 } 2974 2975 static int ram_state_init(RAMState **rsp) 2976 { 2977 *rsp = g_try_new0(RAMState, 1); 2978 2979 if (!*rsp) { 2980 error_report("%s: Init ramstate fail", __func__); 2981 return -1; 2982 } 2983 2984 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2985 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2986 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2987 2988 /* 2989 * Count the total number of pages used by ram blocks not including any 2990 * gaps due to alignment or unplugs. 2991 * This must match with the initial values of dirty bitmap. 
2992 */ 2993 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2994 ram_state_reset(*rsp); 2995 2996 return 0; 2997 } 2998 2999 static void ram_list_init_bitmaps(void) 3000 { 3001 MigrationState *ms = migrate_get_current(); 3002 RAMBlock *block; 3003 unsigned long pages; 3004 uint8_t shift; 3005 3006 /* Skip setting bitmap if there is no RAM */ 3007 if (ram_bytes_total()) { 3008 shift = ms->clear_bitmap_shift; 3009 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3010 error_report("clear_bitmap_shift (%u) too big, using " 3011 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3012 shift = CLEAR_BITMAP_SHIFT_MAX; 3013 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3014 error_report("clear_bitmap_shift (%u) too small, using " 3015 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3016 shift = CLEAR_BITMAP_SHIFT_MIN; 3017 } 3018 3019 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3020 pages = block->max_length >> TARGET_PAGE_BITS; 3021 /* 3022 * The initial dirty bitmap for migration must be set with all 3023 * ones to make sure we'll migrate every guest RAM page to 3024 * destination. 3025 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3026 * new migration after a failed migration, ram_list. 3027 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3028 * guest memory. 3029 */ 3030 block->bmap = bitmap_new(pages); 3031 bitmap_set(block->bmap, 0, pages); 3032 block->clear_bmap_shift = shift; 3033 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3034 } 3035 } 3036 } 3037 3038 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3039 { 3040 unsigned long pages; 3041 RAMBlock *rb; 3042 3043 RCU_READ_LOCK_GUARD(); 3044 3045 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3046 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3047 rs->migration_dirty_pages -= pages; 3048 } 3049 } 3050 3051 static void ram_init_bitmaps(RAMState *rs) 3052 { 3053 /* For memory_global_dirty_log_start below. */ 3054 qemu_mutex_lock_iothread(); 3055 qemu_mutex_lock_ramlist(); 3056 3057 WITH_RCU_READ_LOCK_GUARD() { 3058 ram_list_init_bitmaps(); 3059 /* We don't use dirty log with background snapshots */ 3060 if (!migrate_background_snapshot()) { 3061 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3062 migration_bitmap_sync_precopy(rs); 3063 } 3064 } 3065 qemu_mutex_unlock_ramlist(); 3066 qemu_mutex_unlock_iothread(); 3067 3068 /* 3069 * After an eventual first bitmap sync, fixup the initial bitmap 3070 * containing all 1s to exclude any discarded pages from migration. 3071 */ 3072 migration_bitmap_clear_discarded_pages(rs); 3073 } 3074 3075 static int ram_init_all(RAMState **rsp) 3076 { 3077 if (ram_state_init(rsp)) { 3078 return -1; 3079 } 3080 3081 if (xbzrle_init()) { 3082 ram_state_cleanup(rsp); 3083 return -1; 3084 } 3085 3086 ram_init_bitmaps(*rsp); 3087 3088 return 0; 3089 } 3090 3091 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3092 { 3093 RAMBlock *block; 3094 uint64_t pages = 0; 3095 3096 /* 3097 * Postcopy is not using xbzrle/compression, so no need for that. 3098 * Also, since source are already halted, we don't need to care 3099 * about dirty page logging as well. 3100 */ 3101 3102 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3103 pages += bitmap_count_one(block->bmap, 3104 block->used_length >> TARGET_PAGE_BITS); 3105 } 3106 3107 /* This may not be aligned with current bitmaps. Recalculate. 
*/ 3108 rs->migration_dirty_pages = pages; 3109 3110 ram_state_reset(rs); 3111 3112 /* Update RAMState cache of output QEMUFile */ 3113 rs->f = out; 3114 3115 trace_ram_state_resume_prepare(pages); 3116 } 3117 3118 /* 3119 * This function clears bits of the free pages reported by the caller from the 3120 * migration dirty bitmap. @addr is the host address corresponding to the 3121 * start of the continuous guest free pages, and @len is the total bytes of 3122 * those pages. 3123 */ 3124 void qemu_guest_free_page_hint(void *addr, size_t len) 3125 { 3126 RAMBlock *block; 3127 ram_addr_t offset; 3128 size_t used_len, start, npages; 3129 MigrationState *s = migrate_get_current(); 3130 3131 /* This function is currently expected to be used during live migration */ 3132 if (!migration_is_setup_or_active(s->state)) { 3133 return; 3134 } 3135 3136 for (; len > 0; len -= used_len, addr += used_len) { 3137 block = qemu_ram_block_from_host(addr, false, &offset); 3138 if (unlikely(!block || offset >= block->used_length)) { 3139 /* 3140 * The implementation might not support RAMBlock resize during 3141 * live migration, but it could happen in theory with future 3142 * updates. So we add a check here to capture that case. 3143 */ 3144 error_report_once("%s unexpected error", __func__); 3145 return; 3146 } 3147 3148 if (len <= block->used_length - offset) { 3149 used_len = len; 3150 } else { 3151 used_len = block->used_length - offset; 3152 } 3153 3154 start = offset >> TARGET_PAGE_BITS; 3155 npages = used_len >> TARGET_PAGE_BITS; 3156 3157 qemu_mutex_lock(&ram_state->bitmap_mutex); 3158 /* 3159 * The skipped free pages are equavalent to be sent from clear_bmap's 3160 * perspective, so clear the bits from the memory region bitmap which 3161 * are initially set. Otherwise those skipped pages will be sent in 3162 * the next round after syncing from the memory region bitmap. 3163 */ 3164 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3165 ram_state->migration_dirty_pages -= 3166 bitmap_count_one_with_offset(block->bmap, start, npages); 3167 bitmap_clear(block->bmap, start, npages); 3168 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3169 } 3170 } 3171 3172 /* 3173 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3174 * long-running RCU critical section. When rcu-reclaims in the code 3175 * start to become numerous it will be necessary to reduce the 3176 * granularity of these critical sections. 3177 */ 3178 3179 /** 3180 * ram_save_setup: Setup RAM for migration 3181 * 3182 * Returns zero to indicate success and negative for error 3183 * 3184 * @f: QEMUFile where to send the data 3185 * @opaque: RAMState pointer 3186 */ 3187 static int ram_save_setup(QEMUFile *f, void *opaque) 3188 { 3189 RAMState **rsp = opaque; 3190 RAMBlock *block; 3191 int ret; 3192 3193 if (compress_threads_save_setup()) { 3194 return -1; 3195 } 3196 3197 /* migration has already setup the bitmap, reuse it. 
*/ 3198 if (!migration_in_colo_state()) { 3199 if (ram_init_all(rsp) != 0) { 3200 compress_threads_save_cleanup(); 3201 return -1; 3202 } 3203 } 3204 (*rsp)->f = f; 3205 3206 WITH_RCU_READ_LOCK_GUARD() { 3207 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3208 3209 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3210 qemu_put_byte(f, strlen(block->idstr)); 3211 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3212 qemu_put_be64(f, block->used_length); 3213 if (migrate_postcopy_ram() && block->page_size != 3214 qemu_host_page_size) { 3215 qemu_put_be64(f, block->page_size); 3216 } 3217 if (migrate_ignore_shared()) { 3218 qemu_put_be64(f, block->mr->addr); 3219 } 3220 } 3221 } 3222 3223 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3224 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3225 3226 ret = multifd_send_sync_main(f); 3227 if (ret < 0) { 3228 return ret; 3229 } 3230 3231 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3232 qemu_fflush(f); 3233 3234 return 0; 3235 } 3236 3237 /** 3238 * ram_save_iterate: iterative stage for migration 3239 * 3240 * Returns zero to indicate success and negative for error 3241 * 3242 * @f: QEMUFile where to send the data 3243 * @opaque: RAMState pointer 3244 */ 3245 static int ram_save_iterate(QEMUFile *f, void *opaque) 3246 { 3247 RAMState **temp = opaque; 3248 RAMState *rs = *temp; 3249 int ret = 0; 3250 int i; 3251 int64_t t0; 3252 int done = 0; 3253 3254 if (blk_mig_bulk_active()) { 3255 /* Avoid transferring ram during bulk phase of block migration as 3256 * the bulk phase will usually take a long time and transferring 3257 * ram updates during that time is pointless. */ 3258 goto out; 3259 } 3260 3261 /* 3262 * We'll take this lock a little bit long, but it's okay for two reasons. 3263 * Firstly, the only possible other thread to take it is who calls 3264 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3265 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3266 * guarantees that we'll at least released it in a regular basis. 3267 */ 3268 qemu_mutex_lock(&rs->bitmap_mutex); 3269 WITH_RCU_READ_LOCK_GUARD() { 3270 if (ram_list.version != rs->last_version) { 3271 ram_state_reset(rs); 3272 } 3273 3274 /* Read version before ram_list.blocks */ 3275 smp_rmb(); 3276 3277 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3278 3279 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3280 i = 0; 3281 while ((ret = qemu_file_rate_limit(f)) == 0 || 3282 postcopy_has_request(rs)) { 3283 int pages; 3284 3285 if (qemu_file_get_error(f)) { 3286 break; 3287 } 3288 3289 pages = ram_find_and_save_block(rs); 3290 /* no more pages to sent */ 3291 if (pages == 0) { 3292 done = 1; 3293 break; 3294 } 3295 3296 if (pages < 0) { 3297 qemu_file_set_error(f, pages); 3298 break; 3299 } 3300 3301 rs->target_page_count += pages; 3302 3303 /* 3304 * During postcopy, it is necessary to make sure one whole host 3305 * page is sent in one chunk. 3306 */ 3307 if (migrate_postcopy_ram()) { 3308 flush_compressed_data(rs); 3309 } 3310 3311 /* 3312 * we want to check in the 1st loop, just in case it was the 1st 3313 * time and we had to sync the dirty bitmap. 
3314 * qemu_clock_get_ns() is a bit expensive, so we only check each 3315 * some iterations 3316 */ 3317 if ((i & 63) == 0) { 3318 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3319 1000000; 3320 if (t1 > MAX_WAIT) { 3321 trace_ram_save_iterate_big_wait(t1, i); 3322 break; 3323 } 3324 } 3325 i++; 3326 } 3327 } 3328 qemu_mutex_unlock(&rs->bitmap_mutex); 3329 3330 postcopy_preempt_reset_channel(rs); 3331 3332 /* 3333 * Must occur before EOS (or any QEMUFile operation) 3334 * because of RDMA protocol. 3335 */ 3336 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3337 3338 out: 3339 if (ret >= 0 3340 && migration_is_setup_or_active(migrate_get_current()->state)) { 3341 ret = multifd_send_sync_main(rs->f); 3342 if (ret < 0) { 3343 return ret; 3344 } 3345 3346 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3347 qemu_fflush(f); 3348 ram_transferred_add(8); 3349 3350 ret = qemu_file_get_error(f); 3351 } 3352 if (ret < 0) { 3353 return ret; 3354 } 3355 3356 return done; 3357 } 3358 3359 /** 3360 * ram_save_complete: function called to send the remaining amount of ram 3361 * 3362 * Returns zero to indicate success or negative on error 3363 * 3364 * Called with iothread lock 3365 * 3366 * @f: QEMUFile where to send the data 3367 * @opaque: RAMState pointer 3368 */ 3369 static int ram_save_complete(QEMUFile *f, void *opaque) 3370 { 3371 RAMState **temp = opaque; 3372 RAMState *rs = *temp; 3373 int ret = 0; 3374 3375 rs->last_stage = !migration_in_colo_state(); 3376 3377 WITH_RCU_READ_LOCK_GUARD() { 3378 if (!migration_in_postcopy()) { 3379 migration_bitmap_sync_precopy(rs); 3380 } 3381 3382 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3383 3384 /* try transferring iterative blocks of memory */ 3385 3386 /* flush all remaining blocks regardless of rate limiting */ 3387 while (true) { 3388 int pages; 3389 3390 pages = ram_find_and_save_block(rs); 3391 /* no more blocks to sent */ 3392 if (pages == 0) { 3393 break; 3394 } 3395 if (pages < 0) { 3396 ret = pages; 3397 break; 3398 } 3399 } 3400 3401 flush_compressed_data(rs); 3402 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3403 } 3404 3405 if (ret < 0) { 3406 return ret; 3407 } 3408 3409 postcopy_preempt_reset_channel(rs); 3410 3411 ret = multifd_send_sync_main(rs->f); 3412 if (ret < 0) { 3413 return ret; 3414 } 3415 3416 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3417 qemu_fflush(f); 3418 3419 return 0; 3420 } 3421 3422 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3423 uint64_t *res_precopy_only, 3424 uint64_t *res_compatible, 3425 uint64_t *res_postcopy_only) 3426 { 3427 RAMState **temp = opaque; 3428 RAMState *rs = *temp; 3429 uint64_t remaining_size; 3430 3431 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3432 3433 if (!migration_in_postcopy() && 3434 remaining_size < max_size) { 3435 qemu_mutex_lock_iothread(); 3436 WITH_RCU_READ_LOCK_GUARD() { 3437 migration_bitmap_sync_precopy(rs); 3438 } 3439 qemu_mutex_unlock_iothread(); 3440 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3441 } 3442 3443 if (migrate_postcopy_ram()) { 3444 /* We can do postcopy, and all the data is postcopiable */ 3445 *res_compatible += remaining_size; 3446 } else { 3447 *res_precopy_only += remaining_size; 3448 } 3449 } 3450 3451 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3452 { 3453 unsigned int xh_len; 3454 int xh_flags; 3455 uint8_t *loaded_data; 3456 3457 /* extract RLE header */ 3458 xh_flags = qemu_get_byte(f); 3459 xh_len = qemu_get_be16(f); 3460 3461 if (xh_flags != 
ENCODING_FLAG_XBZRLE) { 3462 error_report("Failed to load XBZRLE page - wrong compression!"); 3463 return -1; 3464 } 3465 3466 if (xh_len > TARGET_PAGE_SIZE) { 3467 error_report("Failed to load XBZRLE page - len overflow!"); 3468 return -1; 3469 } 3470 loaded_data = XBZRLE.decoded_buf; 3471 /* load data and decode */ 3472 /* it can change loaded_data to point to an internal buffer */ 3473 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3474 3475 /* decode RLE */ 3476 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3477 TARGET_PAGE_SIZE) == -1) { 3478 error_report("Failed to load XBZRLE page - decode error!"); 3479 return -1; 3480 } 3481 3482 return 0; 3483 } 3484 3485 /** 3486 * ram_block_from_stream: read a RAMBlock id from the migration stream 3487 * 3488 * Must be called from within a rcu critical section. 3489 * 3490 * Returns a pointer from within the RCU-protected ram_list. 3491 * 3492 * @mis: the migration incoming state pointer 3493 * @f: QEMUFile where to read the data from 3494 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3495 * @channel: the channel we're using 3496 */ 3497 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3498 QEMUFile *f, int flags, 3499 int channel) 3500 { 3501 RAMBlock *block = mis->last_recv_block[channel]; 3502 char id[256]; 3503 uint8_t len; 3504 3505 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3506 if (!block) { 3507 error_report("Ack, bad migration stream!"); 3508 return NULL; 3509 } 3510 return block; 3511 } 3512 3513 len = qemu_get_byte(f); 3514 qemu_get_buffer(f, (uint8_t *)id, len); 3515 id[len] = 0; 3516 3517 block = qemu_ram_block_by_name(id); 3518 if (!block) { 3519 error_report("Can't find block %s", id); 3520 return NULL; 3521 } 3522 3523 if (ramblock_is_ignored(block)) { 3524 error_report("block %s should not be migrated !", id); 3525 return NULL; 3526 } 3527 3528 mis->last_recv_block[channel] = block; 3529 3530 return block; 3531 } 3532 3533 static inline void *host_from_ram_block_offset(RAMBlock *block, 3534 ram_addr_t offset) 3535 { 3536 if (!offset_in_ramblock(block, offset)) { 3537 return NULL; 3538 } 3539 3540 return block->host + offset; 3541 } 3542 3543 static void *host_page_from_ram_block_offset(RAMBlock *block, 3544 ram_addr_t offset) 3545 { 3546 /* Note: Explicitly no check against offset_in_ramblock(). */ 3547 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3548 block->page_size); 3549 } 3550 3551 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3552 ram_addr_t offset) 3553 { 3554 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3555 } 3556 3557 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3558 ram_addr_t offset, bool record_bitmap) 3559 { 3560 if (!offset_in_ramblock(block, offset)) { 3561 return NULL; 3562 } 3563 if (!block->colo_cache) { 3564 error_report("%s: colo_cache is NULL in block :%s", 3565 __func__, block->idstr); 3566 return NULL; 3567 } 3568 3569 /* 3570 * During colo checkpoint, we need bitmap of these migrated pages. 3571 * It help us to decide which pages in ram cache should be flushed 3572 * into VM's RAM later. 3573 */ 3574 if (record_bitmap && 3575 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3576 ram_state->migration_dirty_pages++; 3577 } 3578 return block->colo_cache + offset; 3579 } 3580 3581 /** 3582 * ram_handle_compressed: handle the zero page case 3583 * 3584 * If a page (or a whole RDMA chunk) has been 3585 * determined to be zero, then zap it. 
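* The memset() below is skipped when the destination buffer already reads as
* zero, so an untouched page is not dirtied (or allocated) unnecessarily.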
3586 *
3587 * @host: host address for the zero page
3588 * @ch: what the page is filled from. We only support zero
3589 * @size: size of the zero page
3590 */
3591 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3592 {
3593 if (ch != 0 || !buffer_is_zero(host, size)) {
3594 memset(host, ch, size);
3595 }
3596 }
3597
3598 /* return the size after decompression, or negative value on error */
3599 static int
3600 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3601 const uint8_t *source, size_t source_len)
3602 {
3603 int err;
3604
3605 err = inflateReset(stream);
3606 if (err != Z_OK) {
3607 return -1;
3608 }
3609
3610 stream->avail_in = source_len;
3611 stream->next_in = (uint8_t *)source;
3612 stream->avail_out = dest_len;
3613 stream->next_out = dest;
3614
3615 err = inflate(stream, Z_NO_FLUSH);
3616 if (err != Z_STREAM_END) {
3617 return -1;
3618 }
3619
3620 return stream->total_out;
3621 }
3622
3623 static void *do_data_decompress(void *opaque)
3624 {
3625 DecompressParam *param = opaque;
3626 unsigned long pagesize;
3627 uint8_t *des;
3628 int len, ret;
3629
3630 qemu_mutex_lock(&param->mutex);
3631 while (!param->quit) {
3632 if (param->des) {
3633 des = param->des;
3634 len = param->len;
3635 param->des = 0;
3636 qemu_mutex_unlock(&param->mutex);
3637
3638 pagesize = TARGET_PAGE_SIZE;
3639
3640 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3641 param->compbuf, len);
3642 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3643 error_report("decompress data failed");
3644 qemu_file_set_error(decomp_file, ret);
3645 }
3646
3647 qemu_mutex_lock(&decomp_done_lock);
3648 param->done = true;
3649 qemu_cond_signal(&decomp_done_cond);
3650 qemu_mutex_unlock(&decomp_done_lock);
3651
3652 qemu_mutex_lock(&param->mutex);
3653 } else {
3654 qemu_cond_wait(&param->cond, &param->mutex);
3655 }
3656 }
3657 qemu_mutex_unlock(&param->mutex);
3658
3659 return NULL;
3660 }
3661
3662 static int wait_for_decompress_done(void)
3663 {
3664 int idx, thread_count;
3665
3666 if (!migrate_use_compression()) {
3667 return 0;
3668 }
3669
3670 thread_count = migrate_decompress_threads();
3671 qemu_mutex_lock(&decomp_done_lock);
3672 for (idx = 0; idx < thread_count; idx++) {
3673 while (!decomp_param[idx].done) {
3674 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3675 }
3676 }
3677 qemu_mutex_unlock(&decomp_done_lock);
3678 return qemu_file_get_error(decomp_file);
3679 }
3680
3681 static void compress_threads_load_cleanup(void)
3682 {
3683 int i, thread_count;
3684
3685 if (!migrate_use_compression()) {
3686 return;
3687 }
3688 thread_count = migrate_decompress_threads();
3689 for (i = 0; i < thread_count; i++) {
3690 /*
3691 * we use it as an indicator of whether the thread is
3692 * properly initialized or not
3693 */
3694 if (!decomp_param[i].compbuf) {
3695 break;
3696 }
3697
3698 qemu_mutex_lock(&decomp_param[i].mutex);
3699 decomp_param[i].quit = true;
3700 qemu_cond_signal(&decomp_param[i].cond);
3701 qemu_mutex_unlock(&decomp_param[i].mutex);
3702 }
3703 for (i = 0; i < thread_count; i++) {
3704 if (!decomp_param[i].compbuf) {
3705 break;
3706 }
3707
3708 qemu_thread_join(decompress_threads + i);
3709 qemu_mutex_destroy(&decomp_param[i].mutex);
3710 qemu_cond_destroy(&decomp_param[i].cond);
3711 inflateEnd(&decomp_param[i].stream);
3712 g_free(decomp_param[i].compbuf);
3713 decomp_param[i].compbuf = NULL;
3714 }
3715 g_free(decompress_threads);
3716 g_free(decomp_param);
3717 decompress_threads = NULL;
3718 decomp_param = NULL;
3719 decomp_file = NULL;
3720 }
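/*
 * Summary of the decompression thread-pool handshake implemented by the
 * functions above and below (descriptive only, no new behaviour):
 *
 * - decompress_data_with_multi_threads() picks a worker whose .done flag is
 *   set, clears it, copies the compressed bytes into its compbuf, fills in
 *   .des/.len and signals the worker's .cond;
 * - do_data_decompress() inflates compbuf into .des, then sets .done under
 *   decomp_done_lock and signals decomp_done_cond;
 * - wait_for_decompress_done() waits until every worker reports .done, which
 *   ram_load_postcopy() relies on before placing a whole host page;
 * - .quit, set by compress_threads_load_cleanup(), asks a worker to exit.
 */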
3721 3722 static int compress_threads_load_setup(QEMUFile *f) 3723 { 3724 int i, thread_count; 3725 3726 if (!migrate_use_compression()) { 3727 return 0; 3728 } 3729 3730 thread_count = migrate_decompress_threads(); 3731 decompress_threads = g_new0(QemuThread, thread_count); 3732 decomp_param = g_new0(DecompressParam, thread_count); 3733 qemu_mutex_init(&decomp_done_lock); 3734 qemu_cond_init(&decomp_done_cond); 3735 decomp_file = f; 3736 for (i = 0; i < thread_count; i++) { 3737 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3738 goto exit; 3739 } 3740 3741 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3742 qemu_mutex_init(&decomp_param[i].mutex); 3743 qemu_cond_init(&decomp_param[i].cond); 3744 decomp_param[i].done = true; 3745 decomp_param[i].quit = false; 3746 qemu_thread_create(decompress_threads + i, "decompress", 3747 do_data_decompress, decomp_param + i, 3748 QEMU_THREAD_JOINABLE); 3749 } 3750 return 0; 3751 exit: 3752 compress_threads_load_cleanup(); 3753 return -1; 3754 } 3755 3756 static void decompress_data_with_multi_threads(QEMUFile *f, 3757 void *host, int len) 3758 { 3759 int idx, thread_count; 3760 3761 thread_count = migrate_decompress_threads(); 3762 QEMU_LOCK_GUARD(&decomp_done_lock); 3763 while (true) { 3764 for (idx = 0; idx < thread_count; idx++) { 3765 if (decomp_param[idx].done) { 3766 decomp_param[idx].done = false; 3767 qemu_mutex_lock(&decomp_param[idx].mutex); 3768 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3769 decomp_param[idx].des = host; 3770 decomp_param[idx].len = len; 3771 qemu_cond_signal(&decomp_param[idx].cond); 3772 qemu_mutex_unlock(&decomp_param[idx].mutex); 3773 break; 3774 } 3775 } 3776 if (idx < thread_count) { 3777 break; 3778 } else { 3779 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3780 } 3781 } 3782 } 3783 3784 static void colo_init_ram_state(void) 3785 { 3786 ram_state_init(&ram_state); 3787 } 3788 3789 /* 3790 * colo cache: this is for secondary VM, we cache the whole 3791 * memory of the secondary VM, it is need to hold the global lock 3792 * to call this helper. 3793 */ 3794 int colo_init_ram_cache(void) 3795 { 3796 RAMBlock *block; 3797 3798 WITH_RCU_READ_LOCK_GUARD() { 3799 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3800 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3801 NULL, false, false); 3802 if (!block->colo_cache) { 3803 error_report("%s: Can't alloc memory for COLO cache of block %s," 3804 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3805 block->used_length); 3806 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3807 if (block->colo_cache) { 3808 qemu_anon_ram_free(block->colo_cache, block->used_length); 3809 block->colo_cache = NULL; 3810 } 3811 } 3812 return -errno; 3813 } 3814 if (!machine_dump_guest_core(current_machine)) { 3815 qemu_madvise(block->colo_cache, block->used_length, 3816 QEMU_MADV_DONTDUMP); 3817 } 3818 } 3819 } 3820 3821 /* 3822 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3823 * with to decide which page in cache should be flushed into SVM's RAM. Here 3824 * we use the same name 'ram_bitmap' as for migration. 
3825 */ 3826 if (ram_bytes_total()) { 3827 RAMBlock *block; 3828 3829 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3830 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3831 block->bmap = bitmap_new(pages); 3832 } 3833 } 3834 3835 colo_init_ram_state(); 3836 return 0; 3837 } 3838 3839 /* TODO: duplicated with ram_init_bitmaps */ 3840 void colo_incoming_start_dirty_log(void) 3841 { 3842 RAMBlock *block = NULL; 3843 /* For memory_global_dirty_log_start below. */ 3844 qemu_mutex_lock_iothread(); 3845 qemu_mutex_lock_ramlist(); 3846 3847 memory_global_dirty_log_sync(); 3848 WITH_RCU_READ_LOCK_GUARD() { 3849 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3850 ramblock_sync_dirty_bitmap(ram_state, block); 3851 /* Discard this dirty bitmap record */ 3852 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3853 } 3854 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3855 } 3856 ram_state->migration_dirty_pages = 0; 3857 qemu_mutex_unlock_ramlist(); 3858 qemu_mutex_unlock_iothread(); 3859 } 3860 3861 /* It is need to hold the global lock to call this helper */ 3862 void colo_release_ram_cache(void) 3863 { 3864 RAMBlock *block; 3865 3866 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3867 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3868 g_free(block->bmap); 3869 block->bmap = NULL; 3870 } 3871 3872 WITH_RCU_READ_LOCK_GUARD() { 3873 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3874 if (block->colo_cache) { 3875 qemu_anon_ram_free(block->colo_cache, block->used_length); 3876 block->colo_cache = NULL; 3877 } 3878 } 3879 } 3880 ram_state_cleanup(&ram_state); 3881 } 3882 3883 /** 3884 * ram_load_setup: Setup RAM for migration incoming side 3885 * 3886 * Returns zero to indicate success and negative for error 3887 * 3888 * @f: QEMUFile where to receive the data 3889 * @opaque: RAMState pointer 3890 */ 3891 static int ram_load_setup(QEMUFile *f, void *opaque) 3892 { 3893 if (compress_threads_load_setup(f)) { 3894 return -1; 3895 } 3896 3897 xbzrle_load_setup(); 3898 ramblock_recv_map_init(); 3899 3900 return 0; 3901 } 3902 3903 static int ram_load_cleanup(void *opaque) 3904 { 3905 RAMBlock *rb; 3906 3907 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3908 qemu_ram_block_writeback(rb); 3909 } 3910 3911 xbzrle_load_cleanup(); 3912 compress_threads_load_cleanup(); 3913 3914 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3915 g_free(rb->receivedmap); 3916 rb->receivedmap = NULL; 3917 } 3918 3919 return 0; 3920 } 3921 3922 /** 3923 * ram_postcopy_incoming_init: allocate postcopy data structures 3924 * 3925 * Returns 0 for success and negative if there was one error 3926 * 3927 * @mis: current migration incoming state 3928 * 3929 * Allocate data structures etc needed by incoming migration with 3930 * postcopy-ram. postcopy-ram's similarly names 3931 * postcopy_ram_incoming_init does the work. 3932 */ 3933 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3934 { 3935 return postcopy_ram_incoming_init(mis); 3936 } 3937 3938 /** 3939 * ram_load_postcopy: load a page in postcopy case 3940 * 3941 * Returns 0 for success or -errno in case of error 3942 * 3943 * Called in postcopy mode by ram_load(). 3944 * rcu_read_lock is taken prior to this being called. 
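* Target pages are accumulated in a per-channel temporary host page and are
* only placed (atomically, via the userfaultfd mechanism) once every target
* page of that host page has been received.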
3945 * 3946 * @f: QEMUFile where to send the data 3947 * @channel: the channel to use for loading 3948 */ 3949 int ram_load_postcopy(QEMUFile *f, int channel) 3950 { 3951 int flags = 0, ret = 0; 3952 bool place_needed = false; 3953 bool matches_target_page_size = false; 3954 MigrationIncomingState *mis = migration_incoming_get_current(); 3955 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3956 3957 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3958 ram_addr_t addr; 3959 void *page_buffer = NULL; 3960 void *place_source = NULL; 3961 RAMBlock *block = NULL; 3962 uint8_t ch; 3963 int len; 3964 3965 addr = qemu_get_be64(f); 3966 3967 /* 3968 * If qemu file error, we should stop here, and then "addr" 3969 * may be invalid 3970 */ 3971 ret = qemu_file_get_error(f); 3972 if (ret) { 3973 break; 3974 } 3975 3976 flags = addr & ~TARGET_PAGE_MASK; 3977 addr &= TARGET_PAGE_MASK; 3978 3979 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3980 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3981 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3982 block = ram_block_from_stream(mis, f, flags, channel); 3983 if (!block) { 3984 ret = -EINVAL; 3985 break; 3986 } 3987 3988 /* 3989 * Relying on used_length is racy and can result in false positives. 3990 * We might place pages beyond used_length in case RAM was shrunk 3991 * while in postcopy, which is fine - trying to place via 3992 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3993 */ 3994 if (!block->host || addr >= block->postcopy_length) { 3995 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3996 ret = -EINVAL; 3997 break; 3998 } 3999 tmp_page->target_pages++; 4000 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 4001 /* 4002 * Postcopy requires that we place whole host pages atomically; 4003 * these may be huge pages for RAMBlocks that are backed by 4004 * hugetlbfs. 4005 * To make it atomic, the data is read into a temporary page 4006 * that's moved into place later. 4007 * The migration protocol uses, possibly smaller, target-pages 4008 * however the source ensures it always sends all the components 4009 * of a host page in one chunk. 4010 */ 4011 page_buffer = tmp_page->tmp_huge_page + 4012 host_page_offset_from_ram_block_offset(block, addr); 4013 /* If all TP are zero then we can optimise the place */ 4014 if (tmp_page->target_pages == 1) { 4015 tmp_page->host_addr = 4016 host_page_from_ram_block_offset(block, addr); 4017 } else if (tmp_page->host_addr != 4018 host_page_from_ram_block_offset(block, addr)) { 4019 /* not the 1st TP within the HP */ 4020 error_report("Non-same host page detected on channel %d: " 4021 "Target host page %p, received host page %p " 4022 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 4023 channel, tmp_page->host_addr, 4024 host_page_from_ram_block_offset(block, addr), 4025 block->idstr, addr, tmp_page->target_pages); 4026 ret = -EINVAL; 4027 break; 4028 } 4029 4030 /* 4031 * If it's the last part of a host page then we place the host 4032 * page 4033 */ 4034 if (tmp_page->target_pages == 4035 (block->page_size / TARGET_PAGE_SIZE)) { 4036 place_needed = true; 4037 } 4038 place_source = tmp_page->tmp_huge_page; 4039 } 4040 4041 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4042 case RAM_SAVE_FLAG_ZERO: 4043 ch = qemu_get_byte(f); 4044 /* 4045 * Can skip to set page_buffer when 4046 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
4047 */ 4048 if (ch || !matches_target_page_size) { 4049 memset(page_buffer, ch, TARGET_PAGE_SIZE); 4050 } 4051 if (ch) { 4052 tmp_page->all_zero = false; 4053 } 4054 break; 4055 4056 case RAM_SAVE_FLAG_PAGE: 4057 tmp_page->all_zero = false; 4058 if (!matches_target_page_size) { 4059 /* For huge pages, we always use temporary buffer */ 4060 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 4061 } else { 4062 /* 4063 * For small pages that matches target page size, we 4064 * avoid the qemu_file copy. Instead we directly use 4065 * the buffer of QEMUFile to place the page. Note: we 4066 * cannot do any QEMUFile operation before using that 4067 * buffer to make sure the buffer is valid when 4068 * placing the page. 4069 */ 4070 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 4071 TARGET_PAGE_SIZE); 4072 } 4073 break; 4074 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4075 tmp_page->all_zero = false; 4076 len = qemu_get_be32(f); 4077 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4078 error_report("Invalid compressed data length: %d", len); 4079 ret = -EINVAL; 4080 break; 4081 } 4082 decompress_data_with_multi_threads(f, page_buffer, len); 4083 break; 4084 4085 case RAM_SAVE_FLAG_EOS: 4086 /* normal exit */ 4087 multifd_recv_sync_main(); 4088 break; 4089 default: 4090 error_report("Unknown combination of migration flags: 0x%x" 4091 " (postcopy mode)", flags); 4092 ret = -EINVAL; 4093 break; 4094 } 4095 4096 /* Got the whole host page, wait for decompress before placing. */ 4097 if (place_needed) { 4098 ret |= wait_for_decompress_done(); 4099 } 4100 4101 /* Detect for any possible file errors */ 4102 if (!ret && qemu_file_get_error(f)) { 4103 ret = qemu_file_get_error(f); 4104 } 4105 4106 if (!ret && place_needed) { 4107 if (tmp_page->all_zero) { 4108 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 4109 } else { 4110 ret = postcopy_place_page(mis, tmp_page->host_addr, 4111 place_source, block); 4112 } 4113 place_needed = false; 4114 postcopy_temp_page_reset(tmp_page); 4115 } 4116 } 4117 4118 return ret; 4119 } 4120 4121 static bool postcopy_is_advised(void) 4122 { 4123 PostcopyState ps = postcopy_state_get(); 4124 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 4125 } 4126 4127 static bool postcopy_is_running(void) 4128 { 4129 PostcopyState ps = postcopy_state_get(); 4130 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 4131 } 4132 4133 /* 4134 * Flush content of RAM cache into SVM's memory. 4135 * Only flush the pages that be dirtied by PVM or SVM or both. 
4136 */ 4137 void colo_flush_ram_cache(void) 4138 { 4139 RAMBlock *block = NULL; 4140 void *dst_host; 4141 void *src_host; 4142 unsigned long offset = 0; 4143 4144 memory_global_dirty_log_sync(); 4145 WITH_RCU_READ_LOCK_GUARD() { 4146 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4147 ramblock_sync_dirty_bitmap(ram_state, block); 4148 } 4149 } 4150 4151 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 4152 WITH_RCU_READ_LOCK_GUARD() { 4153 block = QLIST_FIRST_RCU(&ram_list.blocks); 4154 4155 while (block) { 4156 unsigned long num = 0; 4157 4158 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4159 if (!offset_in_ramblock(block, 4160 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4161 offset = 0; 4162 num = 0; 4163 block = QLIST_NEXT_RCU(block, next); 4164 } else { 4165 unsigned long i = 0; 4166 4167 for (i = 0; i < num; i++) { 4168 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4169 } 4170 dst_host = block->host 4171 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4172 src_host = block->colo_cache 4173 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4174 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4175 offset += num; 4176 } 4177 } 4178 } 4179 trace_colo_flush_ram_cache_end(); 4180 } 4181 4182 /** 4183 * ram_load_precopy: load pages in precopy case 4184 * 4185 * Returns 0 for success or -errno in case of error 4186 * 4187 * Called in precopy mode by ram_load(). 4188 * rcu_read_lock is taken prior to this being called. 4189 * 4190 * @f: QEMUFile where to send the data 4191 */ 4192 static int ram_load_precopy(QEMUFile *f) 4193 { 4194 MigrationIncomingState *mis = migration_incoming_get_current(); 4195 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 4196 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4197 bool postcopy_advised = postcopy_is_advised(); 4198 if (!migrate_use_compression()) { 4199 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 4200 } 4201 4202 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4203 ram_addr_t addr, total_ram_bytes; 4204 void *host = NULL, *host_bak = NULL; 4205 uint8_t ch; 4206 4207 /* 4208 * Yield periodically to let main loop run, but an iteration of 4209 * the main loop is expensive, so do it each some iterations 4210 */ 4211 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4212 aio_co_schedule(qemu_get_current_aio_context(), 4213 qemu_coroutine_self()); 4214 qemu_coroutine_yield(); 4215 } 4216 i++; 4217 4218 addr = qemu_get_be64(f); 4219 flags = addr & ~TARGET_PAGE_MASK; 4220 addr &= TARGET_PAGE_MASK; 4221 4222 if (flags & invalid_flags) { 4223 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 4224 error_report("Received an unexpected compressed page"); 4225 } 4226 4227 ret = -EINVAL; 4228 break; 4229 } 4230 4231 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4232 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4233 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4234 RAM_CHANNEL_PRECOPY); 4235 4236 host = host_from_ram_block_offset(block, addr); 4237 /* 4238 * After going into COLO stage, we should not load the page 4239 * into SVM's memory directly, we put them into colo_cache firstly. 4240 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4241 * Previously, we copied all these memory in preparing stage of COLO 4242 * while we need to stop VM, which is a time-consuming process. 
4243 * Here we optimize it by a trick, back-up every page while in 4244 * migration process while COLO is enabled, though it affects the 4245 * speed of the migration, but it obviously reduce the downtime of 4246 * back-up all SVM'S memory in COLO preparing stage. 4247 */ 4248 if (migration_incoming_colo_enabled()) { 4249 if (migration_incoming_in_colo_state()) { 4250 /* In COLO stage, put all pages into cache temporarily */ 4251 host = colo_cache_from_block_offset(block, addr, true); 4252 } else { 4253 /* 4254 * In migration stage but before COLO stage, 4255 * Put all pages into both cache and SVM's memory. 4256 */ 4257 host_bak = colo_cache_from_block_offset(block, addr, false); 4258 } 4259 } 4260 if (!host) { 4261 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4262 ret = -EINVAL; 4263 break; 4264 } 4265 if (!migration_incoming_in_colo_state()) { 4266 ramblock_recv_bitmap_set(block, host); 4267 } 4268 4269 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4270 } 4271 4272 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4273 case RAM_SAVE_FLAG_MEM_SIZE: 4274 /* Synchronize RAM block list */ 4275 total_ram_bytes = addr; 4276 while (!ret && total_ram_bytes) { 4277 RAMBlock *block; 4278 char id[256]; 4279 ram_addr_t length; 4280 4281 len = qemu_get_byte(f); 4282 qemu_get_buffer(f, (uint8_t *)id, len); 4283 id[len] = 0; 4284 length = qemu_get_be64(f); 4285 4286 block = qemu_ram_block_by_name(id); 4287 if (block && !qemu_ram_is_migratable(block)) { 4288 error_report("block %s should not be migrated !", id); 4289 ret = -EINVAL; 4290 } else if (block) { 4291 if (length != block->used_length) { 4292 Error *local_err = NULL; 4293 4294 ret = qemu_ram_resize(block, length, 4295 &local_err); 4296 if (local_err) { 4297 error_report_err(local_err); 4298 } 4299 } 4300 /* For postcopy we need to check hugepage sizes match */ 4301 if (postcopy_advised && migrate_postcopy_ram() && 4302 block->page_size != qemu_host_page_size) { 4303 uint64_t remote_page_size = qemu_get_be64(f); 4304 if (remote_page_size != block->page_size) { 4305 error_report("Mismatched RAM page size %s " 4306 "(local) %zd != %" PRId64, 4307 id, block->page_size, 4308 remote_page_size); 4309 ret = -EINVAL; 4310 } 4311 } 4312 if (migrate_ignore_shared()) { 4313 hwaddr addr = qemu_get_be64(f); 4314 if (ramblock_is_ignored(block) && 4315 block->mr->addr != addr) { 4316 error_report("Mismatched GPAs for block %s " 4317 "%" PRId64 "!= %" PRId64, 4318 id, (uint64_t)addr, 4319 (uint64_t)block->mr->addr); 4320 ret = -EINVAL; 4321 } 4322 } 4323 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 4324 block->idstr); 4325 } else { 4326 error_report("Unknown ramblock \"%s\", cannot " 4327 "accept migration", id); 4328 ret = -EINVAL; 4329 } 4330 4331 total_ram_bytes -= length; 4332 } 4333 break; 4334 4335 case RAM_SAVE_FLAG_ZERO: 4336 ch = qemu_get_byte(f); 4337 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 4338 break; 4339 4340 case RAM_SAVE_FLAG_PAGE: 4341 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4342 break; 4343 4344 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4345 len = qemu_get_be32(f); 4346 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4347 error_report("Invalid compressed data length: %d", len); 4348 ret = -EINVAL; 4349 break; 4350 } 4351 decompress_data_with_multi_threads(f, host, len); 4352 break; 4353 4354 case RAM_SAVE_FLAG_XBZRLE: 4355 if (load_xbzrle(f, addr, host) < 0) { 4356 error_report("Failed to decompress XBZRLE page at " 4357 RAM_ADDR_FMT, addr); 4358 ret = -EINVAL; 4359 break; 4360 } 4361 break; 4362 case 
RAM_SAVE_FLAG_EOS: 4363 /* normal exit */ 4364 multifd_recv_sync_main(); 4365 break; 4366 default: 4367 if (flags & RAM_SAVE_FLAG_HOOK) { 4368 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4369 } else { 4370 error_report("Unknown combination of migration flags: 0x%x", 4371 flags); 4372 ret = -EINVAL; 4373 } 4374 } 4375 if (!ret) { 4376 ret = qemu_file_get_error(f); 4377 } 4378 if (!ret && host_bak) { 4379 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4380 } 4381 } 4382 4383 ret |= wait_for_decompress_done(); 4384 return ret; 4385 } 4386 4387 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4388 { 4389 int ret = 0; 4390 static uint64_t seq_iter; 4391 /* 4392 * If system is running in postcopy mode, page inserts to host memory must 4393 * be atomic 4394 */ 4395 bool postcopy_running = postcopy_is_running(); 4396 4397 seq_iter++; 4398 4399 if (version_id != 4) { 4400 return -EINVAL; 4401 } 4402 4403 /* 4404 * This RCU critical section can be very long running. 4405 * When RCU reclaims in the code start to become numerous, 4406 * it will be necessary to reduce the granularity of this 4407 * critical section. 4408 */ 4409 WITH_RCU_READ_LOCK_GUARD() { 4410 if (postcopy_running) { 4411 /* 4412 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4413 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4414 * service fast page faults. 4415 */ 4416 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4417 } else { 4418 ret = ram_load_precopy(f); 4419 } 4420 } 4421 trace_ram_load_complete(ret, seq_iter); 4422 4423 return ret; 4424 } 4425 4426 static bool ram_has_postcopy(void *opaque) 4427 { 4428 RAMBlock *rb; 4429 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4430 if (ramblock_is_pmem(rb)) { 4431 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4432 "is not supported now!", rb->idstr, rb->host); 4433 return false; 4434 } 4435 } 4436 4437 return migrate_postcopy_ram(); 4438 } 4439 4440 /* Sync all the dirty bitmap with destination VM. */ 4441 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4442 { 4443 RAMBlock *block; 4444 QEMUFile *file = s->to_dst_file; 4445 int ramblock_count = 0; 4446 4447 trace_ram_dirty_bitmap_sync_start(); 4448 4449 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4450 qemu_savevm_send_recv_bitmap(file, block->idstr); 4451 trace_ram_dirty_bitmap_request(block->idstr); 4452 ramblock_count++; 4453 } 4454 4455 trace_ram_dirty_bitmap_sync_wait(); 4456 4457 /* Wait until all the ramblocks' dirty bitmap synced */ 4458 while (ramblock_count--) { 4459 qemu_sem_wait(&s->rp_state.rp_sem); 4460 } 4461 4462 trace_ram_dirty_bitmap_sync_complete(); 4463 4464 return 0; 4465 } 4466 4467 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4468 { 4469 qemu_sem_post(&s->rp_state.rp_sem); 4470 } 4471 4472 /* 4473 * Read the received bitmap, revert it as the initial dirty bitmap. 4474 * This is only used when the postcopy migration is paused but wants 4475 * to resume from a middle point. 
4476 */ 4477 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4478 { 4479 int ret = -EINVAL; 4480 /* from_dst_file is always valid because we're within rp_thread */ 4481 QEMUFile *file = s->rp_state.from_dst_file; 4482 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4483 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4484 uint64_t size, end_mark; 4485 4486 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4487 4488 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4489 error_report("%s: incorrect state %s", __func__, 4490 MigrationStatus_str(s->state)); 4491 return -EINVAL; 4492 } 4493 4494 /* 4495 * Note: see comments in ramblock_recv_bitmap_send() on why we 4496 * need the endianness conversion, and the paddings. 4497 */ 4498 local_size = ROUND_UP(local_size, 8); 4499 4500 /* Add paddings */ 4501 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4502 4503 size = qemu_get_be64(file); 4504 4505 /* The size of the bitmap should match with our ramblock */ 4506 if (size != local_size) { 4507 error_report("%s: ramblock '%s' bitmap size mismatch " 4508 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4509 block->idstr, size, local_size); 4510 ret = -EINVAL; 4511 goto out; 4512 } 4513 4514 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4515 end_mark = qemu_get_be64(file); 4516 4517 ret = qemu_file_get_error(file); 4518 if (ret || size != local_size) { 4519 error_report("%s: read bitmap failed for ramblock '%s': %d" 4520 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4521 __func__, block->idstr, ret, local_size, size); 4522 ret = -EIO; 4523 goto out; 4524 } 4525 4526 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4527 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4528 __func__, block->idstr, end_mark); 4529 ret = -EINVAL; 4530 goto out; 4531 } 4532 4533 /* 4534 * Endianness conversion. We are during postcopy (though paused). 4535 * The dirty bitmap won't change. We can directly modify it. 4536 */ 4537 bitmap_from_le(block->bmap, le_bitmap, nbits); 4538 4539 /* 4540 * What we received is "received bitmap". Revert it as the initial 4541 * dirty bitmap for this ramblock. 4542 */ 4543 bitmap_complement(block->bmap, block->bmap, nbits); 4544 4545 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4546 ramblock_dirty_bitmap_clear_discarded_pages(block); 4547 4548 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4549 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4550 4551 /* 4552 * We succeeded to sync bitmap for current ramblock. If this is 4553 * the last one to sync, we need to notify the main send thread. 
4554 */ 4555 ram_dirty_bitmap_reload_notify(s); 4556 4557 ret = 0; 4558 out: 4559 g_free(le_bitmap); 4560 return ret; 4561 } 4562 4563 static int ram_resume_prepare(MigrationState *s, void *opaque) 4564 { 4565 RAMState *rs = *(RAMState **)opaque; 4566 int ret; 4567 4568 ret = ram_dirty_bitmap_sync_all(s, rs); 4569 if (ret) { 4570 return ret; 4571 } 4572 4573 ram_state_resume_prepare(rs, s->to_dst_file); 4574 4575 return 0; 4576 } 4577 4578 void postcopy_preempt_shutdown_file(MigrationState *s) 4579 { 4580 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4581 qemu_fflush(s->postcopy_qemufile_src); 4582 } 4583 4584 static SaveVMHandlers savevm_ram_handlers = { 4585 .save_setup = ram_save_setup, 4586 .save_live_iterate = ram_save_iterate, 4587 .save_live_complete_postcopy = ram_save_complete, 4588 .save_live_complete_precopy = ram_save_complete, 4589 .has_postcopy = ram_has_postcopy, 4590 .save_live_pending = ram_save_pending, 4591 .load_state = ram_load, 4592 .save_cleanup = ram_save_cleanup, 4593 .load_setup = ram_load_setup, 4594 .load_cleanup = ram_load_cleanup, 4595 .resume_prepare = ram_resume_prepare, 4596 }; 4597 4598 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4599 size_t old_size, size_t new_size) 4600 { 4601 PostcopyState ps = postcopy_state_get(); 4602 ram_addr_t offset; 4603 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4604 Error *err = NULL; 4605 4606 if (ramblock_is_ignored(rb)) { 4607 return; 4608 } 4609 4610 if (!migration_is_idle()) { 4611 /* 4612 * Precopy code on the source cannot deal with the size of RAM blocks 4613 * changing at random points in time - especially after sending the 4614 * RAM block sizes in the migration stream, they must no longer change. 4615 * Abort and indicate a proper reason. 4616 */ 4617 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4618 migration_cancel(err); 4619 error_free(err); 4620 } 4621 4622 switch (ps) { 4623 case POSTCOPY_INCOMING_ADVISE: 4624 /* 4625 * Update what ram_postcopy_incoming_init()->init_range() does at the 4626 * time postcopy was advised. Syncing RAM blocks with the source will 4627 * result in RAM resizes. 4628 */ 4629 if (old_size < new_size) { 4630 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4631 error_report("RAM block '%s' discard of resized RAM failed", 4632 rb->idstr); 4633 } 4634 } 4635 rb->postcopy_length = new_size; 4636 break; 4637 case POSTCOPY_INCOMING_NONE: 4638 case POSTCOPY_INCOMING_RUNNING: 4639 case POSTCOPY_INCOMING_END: 4640 /* 4641 * Once our guest is running, postcopy does no longer care about 4642 * resizes. When growing, the new memory was not available on the 4643 * source, no handler needed. 4644 */ 4645 break; 4646 default: 4647 error_report("RAM block '%s' resized during postcopy state: %d", 4648 rb->idstr, ps); 4649 exit(-1); 4650 } 4651 } 4652 4653 static RAMBlockNotifier ram_mig_ram_notifier = { 4654 .ram_block_resized = ram_mig_ram_block_resized, 4655 }; 4656 4657 void ram_mig_init(void) 4658 { 4659 qemu_mutex_init(&XBZRLE.lock); 4660 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4661 ram_block_notifier_add(&ram_mig_ram_notifier); 4662 } 4663