1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 
27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "io/channel-null.h" 36 #include "xbzrle.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qmp/qerror.h" 49 #include "trace.h" 50 #include "exec/ram_addr.h" 51 #include "exec/target_page.h" 52 #include "qemu/rcu_queue.h" 53 #include "migration/colo.h" 54 #include "block.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 61 #include "hw/boards.h" /* for machine_dump_guest_core() */ 62 63 #if defined(__linux__) 64 #include "qemu/userfaultfd.h" 65 #endif /* defined(__linux__) */ 66 67 /***********************************************************/ 68 /* ram save/restore */ 69 70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 71 * worked for pages that where filled with the same char. We switched 72 * it to only search for the zero value. And to avoid confusion with 73 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it. 
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

XBZRLECacheStats xbzrle_counters;

/* struct contains the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

/* Take XBZRLE.lock, but only when XBZRLE is actually in use */
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    /* NOTE(review): declared int64_t but the function returns int;
     * values are only 0/-1 so this is harmless — confirm before changing */
    int64_t ret = 0;

    /* Check for truncation when uint64_t is narrowed to size_t */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    /* Only replace an already-initialized cache; a fresh one is built lazily */
    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

/* Allocate the destination-side received-page bitmaps, one per block */
static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

typedef struct {
    /*
     * Cached ramblock/offset values if preempted.  They're only meaningful
     * if preempted==true below.
     */
    RAMBlock *ram_block;
    unsigned long ram_page;
    /*
     * Whether a postcopy preemption just happened.  Will be reset after
     * precopy recovered to background migration.
     */
    bool preempted;
} PostcopyPreemptState;

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes
       since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* amount of count that no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /* Postcopy preemption information */
    PostcopyPreemptState postcopy_preempt_state;
    /*
     * Current channel we're using on src VM. Only valid if postcopy-preempt
     * is enabled.
     */
    unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Forget any cached preemption state (e.g. when resuming precopy) */
static void postcopy_preempt_reset(RAMState *rs)
{
    memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
}

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* Account transferred bytes to the counter matching the migration phase */
static void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        ram_counters.postcopy_bytes += bytes;
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    ram_counters.transferred += bytes;
}

void dirty_sync_missed_zero_copy(void)
{
    ram_counters.dirty_sync_missed_zero_copy++;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /*
     * [POSTCOPY-ONLY] Whether current page is explicitly requested by
     * postcopy.  When set, the request is "urgent" because the dest QEMU
     * threads are waiting for us.
     */
    bool         postcopy_requested;
    /*
     * [POSTCOPY-ONLY] The target channel to use to send current page.
     *
     * Note: This may _not_ match with the value in postcopy_requested
     * above.  Let's imagine the case where the postcopy request is exactly
     * the page that we're sending in progress during precopy.  In this case
     * we'll have postcopy_requested set to true but the target channel
     * will be the precopy channel (so that we don't split brain on that
     * specific page since the precopy channel already contains partial of
     * that page data).
     *
     * Besides that specific use case, postcopy_target_channel should
     * always be equal to postcopy_requested, because by default we send
     * postcopy pages via postcopy preempt channel.
     */
    bool         postcopy_target_channel;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
                                     bool postcopy_requested);

/* Worker thread: compress queued pages until asked to quit */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* Drop param->mutex while compressing so the producer can poke us */
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

/* Returns 0 on success, -1 on error (partially-created threads cleaned up) */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        /* Cache miss: send the page as a normal (non-xbzrle) page */
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
size. If it's got some huge pages then it's the OR of all the 1070 * different page sizes. 1071 */ 1072 uint64_t ram_pagesize_summary(void) 1073 { 1074 RAMBlock *block; 1075 uint64_t summary = 0; 1076 1077 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1078 summary |= block->page_size; 1079 } 1080 1081 return summary; 1082 } 1083 1084 uint64_t ram_get_total_transferred_pages(void) 1085 { 1086 return ram_counters.normal + ram_counters.duplicate + 1087 compression_counters.pages + xbzrle_counters.pages; 1088 } 1089 1090 static void migration_update_rates(RAMState *rs, int64_t end_time) 1091 { 1092 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1093 double compressed_size; 1094 1095 /* calculate period counters */ 1096 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1097 / (end_time - rs->time_last_bitmap_sync); 1098 1099 if (!page_count) { 1100 return; 1101 } 1102 1103 if (migrate_use_xbzrle()) { 1104 double encoded_size, unencoded_size; 1105 1106 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1107 rs->xbzrle_cache_miss_prev) / page_count; 1108 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1109 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1110 TARGET_PAGE_SIZE; 1111 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1112 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1113 xbzrle_counters.encoding_rate = 0; 1114 } else { 1115 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1116 } 1117 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1118 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1119 } 1120 1121 if (migrate_use_compression()) { 1122 compression_counters.busy_rate = (double)(compression_counters.busy - 1123 rs->compress_thread_busy_prev) / page_count; 1124 rs->compress_thread_busy_prev = compression_counters.busy; 1125 1126 compressed_size = compression_counters.compressed_size - 1127 rs->compressed_size_prev; 1128 if 
(compressed_size) { 1129 double uncompressed_size = (compression_counters.pages - 1130 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1131 1132 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1133 compression_counters.compression_rate = 1134 uncompressed_size / compressed_size; 1135 1136 rs->compress_pages_prev = compression_counters.pages; 1137 rs->compressed_size_prev = compression_counters.compressed_size; 1138 } 1139 } 1140 } 1141 1142 static void migration_trigger_throttle(RAMState *rs) 1143 { 1144 MigrationState *s = migrate_get_current(); 1145 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1146 1147 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 1148 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1149 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1150 1151 /* During block migration the auto-converge logic incorrectly detects 1152 * that ram migration makes no progress. Avoid this by disabling the 1153 * throttling logic during the bulk phase of block migration. */ 1154 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1155 /* The following detection logic can be refined later. For now: 1156 Check to see if the ratio between dirtied bytes and the approx. 1157 amount of bytes that just got transferred since the last time 1158 we were in this routine reaches the threshold. If that happens 1159 twice, start or increase throttling. 
*/ 1160 1161 if ((bytes_dirty_period > bytes_dirty_threshold) && 1162 (++rs->dirty_rate_high_cnt >= 2)) { 1163 trace_migration_throttle(); 1164 rs->dirty_rate_high_cnt = 0; 1165 mig_throttle_guest_down(bytes_dirty_period, 1166 bytes_dirty_threshold); 1167 } 1168 } 1169 } 1170 1171 static void migration_bitmap_sync(RAMState *rs) 1172 { 1173 RAMBlock *block; 1174 int64_t end_time; 1175 1176 ram_counters.dirty_sync_count++; 1177 1178 if (!rs->time_last_bitmap_sync) { 1179 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1180 } 1181 1182 trace_migration_bitmap_sync_start(); 1183 memory_global_dirty_log_sync(); 1184 1185 qemu_mutex_lock(&rs->bitmap_mutex); 1186 WITH_RCU_READ_LOCK_GUARD() { 1187 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1188 ramblock_sync_dirty_bitmap(rs, block); 1189 } 1190 ram_counters.remaining = ram_bytes_remaining(); 1191 } 1192 qemu_mutex_unlock(&rs->bitmap_mutex); 1193 1194 memory_global_after_dirty_log_sync(); 1195 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1196 1197 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1198 1199 /* more than 1 second = 1000 millisecons */ 1200 if (end_time > rs->time_last_bitmap_sync + 1000) { 1201 migration_trigger_throttle(rs); 1202 1203 migration_update_rates(rs, end_time); 1204 1205 rs->target_page_count_prev = rs->target_page_count; 1206 1207 /* reset period counters */ 1208 rs->time_last_bitmap_sync = end_time; 1209 rs->num_dirty_pages_period = 0; 1210 rs->bytes_xfer_prev = ram_counters.transferred; 1211 } 1212 if (migrate_use_events()) { 1213 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1214 } 1215 } 1216 1217 static void migration_bitmap_sync_precopy(RAMState *rs) 1218 { 1219 Error *local_err = NULL; 1220 1221 /* 1222 * The current notifier usage is just an optimization to migration, so we 1223 * don't stop the normal migration process in the error case. 
1224 */ 1225 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1226 error_report_err(local_err); 1227 local_err = NULL; 1228 } 1229 1230 migration_bitmap_sync(rs); 1231 1232 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1233 error_report_err(local_err); 1234 } 1235 } 1236 1237 static void ram_release_page(const char *rbname, uint64_t offset) 1238 { 1239 if (!migrate_release_ram() || !migration_in_postcopy()) { 1240 return; 1241 } 1242 1243 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1244 } 1245 1246 /** 1247 * save_zero_page_to_file: send the zero page to the file 1248 * 1249 * Returns the size of data written to the file, 0 means the page is not 1250 * a zero page 1251 * 1252 * @rs: current RAM state 1253 * @file: the file where the data is saved 1254 * @block: block that contains the page we want to send 1255 * @offset: offset inside the block for the page 1256 */ 1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1258 RAMBlock *block, ram_addr_t offset) 1259 { 1260 uint8_t *p = block->host + offset; 1261 int len = 0; 1262 1263 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1264 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1265 qemu_put_byte(file, 0); 1266 len += 1; 1267 ram_release_page(block->idstr, offset); 1268 } 1269 return len; 1270 } 1271 1272 /** 1273 * save_zero_page: send the zero page to the stream 1274 * 1275 * Returns the number of pages written. 
1276 * 1277 * @rs: current RAM state 1278 * @block: block that contains the page we want to send 1279 * @offset: offset inside the block for the page 1280 */ 1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1282 { 1283 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1284 1285 if (len) { 1286 ram_counters.duplicate++; 1287 ram_transferred_add(len); 1288 return 1; 1289 } 1290 return -1; 1291 } 1292 1293 /* 1294 * @pages: the number of pages written by the control path, 1295 * < 0 - error 1296 * > 0 - number of pages written 1297 * 1298 * Return true if the pages has been saved, otherwise false is returned. 1299 */ 1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1301 int *pages) 1302 { 1303 uint64_t bytes_xmit = 0; 1304 int ret; 1305 1306 *pages = -1; 1307 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1308 &bytes_xmit); 1309 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1310 return false; 1311 } 1312 1313 if (bytes_xmit) { 1314 ram_transferred_add(bytes_xmit); 1315 *pages = 1; 1316 } 1317 1318 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1319 return true; 1320 } 1321 1322 if (bytes_xmit > 0) { 1323 ram_counters.normal++; 1324 } else if (bytes_xmit == 0) { 1325 ram_counters.duplicate++; 1326 } 1327 1328 return true; 1329 } 1330 1331 /* 1332 * directly send the page to the stream 1333 * 1334 * Returns the number of pages written. 
1335 * 1336 * @rs: current RAM state 1337 * @block: block that contains the page we want to send 1338 * @offset: offset inside the block for the page 1339 * @buf: the page to be sent 1340 * @async: send to page asyncly 1341 */ 1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1343 uint8_t *buf, bool async) 1344 { 1345 ram_transferred_add(save_page_header(rs, rs->f, block, 1346 offset | RAM_SAVE_FLAG_PAGE)); 1347 if (async) { 1348 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1349 migrate_release_ram() && 1350 migration_in_postcopy()); 1351 } else { 1352 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1353 } 1354 ram_transferred_add(TARGET_PAGE_SIZE); 1355 ram_counters.normal++; 1356 return 1; 1357 } 1358 1359 /** 1360 * ram_save_page: send the given page to the stream 1361 * 1362 * Returns the number of pages written. 1363 * < 0 - error 1364 * >=0 - Number of pages written - this might legally be 0 1365 * if xbzrle noticed the page was the same. 1366 * 1367 * @rs: current RAM state 1368 * @block: block that contains the page we want to send 1369 * @offset: offset inside the block for the page 1370 */ 1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1372 { 1373 int pages = -1; 1374 uint8_t *p; 1375 bool send_async = true; 1376 RAMBlock *block = pss->block; 1377 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1378 ram_addr_t current_addr = block->offset + offset; 1379 1380 p = block->host + offset; 1381 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1382 1383 XBZRLE_cache_lock(); 1384 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1385 pages = save_xbzrle_page(rs, &p, current_addr, block, 1386 offset); 1387 if (!rs->last_stage) { 1388 /* Can't send this cached data async, since the cache page 1389 * might get updated before it gets to the wire 1390 */ 1391 send_async = false; 1392 } 1393 } 1394 1395 /* XBZRLE overflow or normal page */ 1396 if (pages == -1) { 1397 pages = 
save_normal_page(rs, block, offset, p, send_async); 1398 } 1399 1400 XBZRLE_cache_unlock(); 1401 1402 return pages; 1403 } 1404 1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1406 ram_addr_t offset) 1407 { 1408 if (multifd_queue_page(rs->f, block, offset) < 0) { 1409 return -1; 1410 } 1411 ram_counters.normal++; 1412 1413 return 1; 1414 } 1415 1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1417 ram_addr_t offset, uint8_t *source_buf) 1418 { 1419 RAMState *rs = ram_state; 1420 uint8_t *p = block->host + offset; 1421 int ret; 1422 1423 if (save_zero_page_to_file(rs, f, block, offset)) { 1424 return true; 1425 } 1426 1427 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1428 1429 /* 1430 * copy it to a internal buffer to avoid it being modified by VM 1431 * so that we can catch up the error during compression and 1432 * decompression 1433 */ 1434 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1435 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1436 if (ret < 0) { 1437 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1438 error_report("compressed data failed!"); 1439 } 1440 return false; 1441 } 1442 1443 static void 1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1445 { 1446 ram_transferred_add(bytes_xmit); 1447 1448 if (param->zero_page) { 1449 ram_counters.duplicate++; 1450 return; 1451 } 1452 1453 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. 
*/ 1454 compression_counters.compressed_size += bytes_xmit - 8; 1455 compression_counters.pages++; 1456 } 1457 1458 static bool save_page_use_compression(RAMState *rs); 1459 1460 static void flush_compressed_data(RAMState *rs) 1461 { 1462 int idx, len, thread_count; 1463 1464 if (!save_page_use_compression(rs)) { 1465 return; 1466 } 1467 thread_count = migrate_compress_threads(); 1468 1469 qemu_mutex_lock(&comp_done_lock); 1470 for (idx = 0; idx < thread_count; idx++) { 1471 while (!comp_param[idx].done) { 1472 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1473 } 1474 } 1475 qemu_mutex_unlock(&comp_done_lock); 1476 1477 for (idx = 0; idx < thread_count; idx++) { 1478 qemu_mutex_lock(&comp_param[idx].mutex); 1479 if (!comp_param[idx].quit) { 1480 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1481 /* 1482 * it's safe to fetch zero_page without holding comp_done_lock 1483 * as there is no further request submitted to the thread, 1484 * i.e, the thread should be waiting for a request at this point. 
1485 */ 1486 update_compress_thread_counts(&comp_param[idx], len); 1487 } 1488 qemu_mutex_unlock(&comp_param[idx].mutex); 1489 } 1490 } 1491 1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1493 ram_addr_t offset) 1494 { 1495 param->block = block; 1496 param->offset = offset; 1497 } 1498 1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1500 ram_addr_t offset) 1501 { 1502 int idx, thread_count, bytes_xmit = -1, pages = -1; 1503 bool wait = migrate_compress_wait_thread(); 1504 1505 thread_count = migrate_compress_threads(); 1506 qemu_mutex_lock(&comp_done_lock); 1507 retry: 1508 for (idx = 0; idx < thread_count; idx++) { 1509 if (comp_param[idx].done) { 1510 comp_param[idx].done = false; 1511 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1512 qemu_mutex_lock(&comp_param[idx].mutex); 1513 set_compress_params(&comp_param[idx], block, offset); 1514 qemu_cond_signal(&comp_param[idx].cond); 1515 qemu_mutex_unlock(&comp_param[idx].mutex); 1516 pages = 1; 1517 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1518 break; 1519 } 1520 } 1521 1522 /* 1523 * wait for the free thread if the user specifies 'compress-wait-thread', 1524 * otherwise we will post the page out in the main thread as normal page. 1525 */ 1526 if (pages < 0 && wait) { 1527 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1528 goto retry; 1529 } 1530 qemu_mutex_unlock(&comp_done_lock); 1531 1532 return pages; 1533 } 1534 1535 /** 1536 * find_dirty_block: find the next dirty page and update any state 1537 * associated with the search process. 
1538 * 1539 * Returns true if a page is found 1540 * 1541 * @rs: current RAM state 1542 * @pss: data about the state of the current dirty page scan 1543 * @again: set to false if the search has scanned the whole of RAM 1544 */ 1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1546 { 1547 /* 1548 * This is not a postcopy requested page, mark it "not urgent", and use 1549 * precopy channel to send it. 1550 */ 1551 pss->postcopy_requested = false; 1552 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; 1553 1554 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1555 if (pss->complete_round && pss->block == rs->last_seen_block && 1556 pss->page >= rs->last_page) { 1557 /* 1558 * We've been once around the RAM and haven't found anything. 1559 * Give up. 1560 */ 1561 *again = false; 1562 return false; 1563 } 1564 if (!offset_in_ramblock(pss->block, 1565 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1566 /* Didn't find anything in this RAM Block */ 1567 pss->page = 0; 1568 pss->block = QLIST_NEXT_RCU(pss->block, next); 1569 if (!pss->block) { 1570 /* 1571 * If memory migration starts over, we will meet a dirtied page 1572 * which may still exists in compression threads's ring, so we 1573 * should flush the compressed data to make sure the new page 1574 * is not overwritten by the old one in the destination. 1575 * 1576 * Also If xbzrle is on, stop using the data compression at this 1577 * point. In theory, xbzrle can do better than compression. 1578 */ 1579 flush_compressed_data(rs); 1580 1581 /* Hit the end of the list */ 1582 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1583 /* Flag that we've looped */ 1584 pss->complete_round = true; 1585 /* After the first round, enable XBZRLE. 
*/ 1586 if (migrate_use_xbzrle()) { 1587 rs->xbzrle_enabled = true; 1588 } 1589 } 1590 /* Didn't find anything this time, but try again on the new block */ 1591 *again = true; 1592 return false; 1593 } else { 1594 /* Can go around again, but... */ 1595 *again = true; 1596 /* We've found something so probably don't need to */ 1597 return true; 1598 } 1599 } 1600 1601 /** 1602 * unqueue_page: gets a page of the queue 1603 * 1604 * Helper for 'get_queued_page' - gets a page off the queue 1605 * 1606 * Returns the block of the page (or NULL if none available) 1607 * 1608 * @rs: current RAM state 1609 * @offset: used to return the offset within the RAMBlock 1610 */ 1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1612 { 1613 struct RAMSrcPageRequest *entry; 1614 RAMBlock *block = NULL; 1615 1616 if (!postcopy_has_request(rs)) { 1617 return NULL; 1618 } 1619 1620 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1621 1622 /* 1623 * This should _never_ change even after we take the lock, because no one 1624 * should be taking anything off the request list other than us. 
1625 */ 1626 assert(postcopy_has_request(rs)); 1627 1628 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1629 block = entry->rb; 1630 *offset = entry->offset; 1631 1632 if (entry->len > TARGET_PAGE_SIZE) { 1633 entry->len -= TARGET_PAGE_SIZE; 1634 entry->offset += TARGET_PAGE_SIZE; 1635 } else { 1636 memory_region_unref(block->mr); 1637 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1638 g_free(entry); 1639 migration_consume_urgent_request(); 1640 } 1641 1642 return block; 1643 } 1644 1645 #if defined(__linux__) 1646 /** 1647 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1648 * is found, return RAM block pointer and page offset 1649 * 1650 * Returns pointer to the RAMBlock containing faulting page, 1651 * NULL if no write faults are pending 1652 * 1653 * @rs: current RAM state 1654 * @offset: page offset from the beginning of the block 1655 */ 1656 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1657 { 1658 struct uffd_msg uffd_msg; 1659 void *page_address; 1660 RAMBlock *block; 1661 int res; 1662 1663 if (!migrate_background_snapshot()) { 1664 return NULL; 1665 } 1666 1667 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1668 if (res <= 0) { 1669 return NULL; 1670 } 1671 1672 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1673 block = qemu_ram_block_from_host(page_address, false, offset); 1674 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1675 return block; 1676 } 1677 1678 /** 1679 * ram_save_release_protection: release UFFD write protection after 1680 * a range of pages has been saved 1681 * 1682 * @rs: current RAM state 1683 * @pss: page-search-status structure 1684 * @start_page: index of the first page in the range relative to pss->block 1685 * 1686 * Returns 0 on success, negative value in case of an error 1687 */ 1688 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1689 unsigned long start_page) 1690 { 1691 int res = 0; 1692 1693 /* 
Check if page is from UFFD-managed region. */ 1694 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1695 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1696 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1697 1698 /* Flush async buffers before un-protect. */ 1699 qemu_fflush(rs->f); 1700 /* Un-protect memory range. */ 1701 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1702 false, false); 1703 } 1704 1705 return res; 1706 } 1707 1708 /* ram_write_tracking_available: check if kernel supports required UFFD features 1709 * 1710 * Returns true if supports, false otherwise 1711 */ 1712 bool ram_write_tracking_available(void) 1713 { 1714 uint64_t uffd_features; 1715 int res; 1716 1717 res = uffd_query_features(&uffd_features); 1718 return (res == 0 && 1719 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1720 } 1721 1722 /* ram_write_tracking_compatible: check if guest configuration is 1723 * compatible with 'write-tracking' 1724 * 1725 * Returns true if compatible, false otherwise 1726 */ 1727 bool ram_write_tracking_compatible(void) 1728 { 1729 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1730 int uffd_fd; 1731 RAMBlock *block; 1732 bool ret = false; 1733 1734 /* Open UFFD file descriptor */ 1735 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1736 if (uffd_fd < 0) { 1737 return false; 1738 } 1739 1740 RCU_READ_LOCK_GUARD(); 1741 1742 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1743 uint64_t uffd_ioctls; 1744 1745 /* Nothing to do with read-only and MMIO-writable regions */ 1746 if (block->mr->readonly || block->mr->rom_device) { 1747 continue; 1748 } 1749 /* Try to register block memory via UFFD-IO to track writes */ 1750 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1751 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1752 goto out; 1753 } 1754 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1755 goto out; 1756 } 1757 } 1758 ret = true; 
1759 1760 out: 1761 uffd_close_fd(uffd_fd); 1762 return ret; 1763 } 1764 1765 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1766 ram_addr_t size) 1767 { 1768 /* 1769 * We read one byte of each page; this will preallocate page tables if 1770 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1771 * where no page was populated yet. This might require adaption when 1772 * supporting other mappings, like shmem. 1773 */ 1774 for (; offset < size; offset += block->page_size) { 1775 char tmp = *((char *)block->host + offset); 1776 1777 /* Don't optimize the read out */ 1778 asm volatile("" : "+r" (tmp)); 1779 } 1780 } 1781 1782 static inline int populate_read_section(MemoryRegionSection *section, 1783 void *opaque) 1784 { 1785 const hwaddr size = int128_get64(section->size); 1786 hwaddr offset = section->offset_within_region; 1787 RAMBlock *block = section->mr->ram_block; 1788 1789 populate_read_range(block, offset, size); 1790 return 0; 1791 } 1792 1793 /* 1794 * ram_block_populate_read: preallocate page tables and populate pages in the 1795 * RAM block by reading a byte of each page. 1796 * 1797 * Since it's solely used for userfault_fd WP feature, here we just 1798 * hardcode page size to qemu_real_host_page_size. 1799 * 1800 * @block: RAM block to populate 1801 */ 1802 static void ram_block_populate_read(RAMBlock *rb) 1803 { 1804 /* 1805 * Skip populating all pages that fall into a discarded range as managed by 1806 * a RamDiscardManager responsible for the mapped memory region of the 1807 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1808 * must not get populated automatically. We don't have to track 1809 * modifications via userfaultfd WP reliably, because these pages will 1810 * not be part of the migration stream either way -- see 1811 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1812 * 1813 * Note: The result is only stable while migrating (precopy/postcopy). 
1814 */ 1815 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1816 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1817 MemoryRegionSection section = { 1818 .mr = rb->mr, 1819 .offset_within_region = 0, 1820 .size = rb->mr->size, 1821 }; 1822 1823 ram_discard_manager_replay_populated(rdm, §ion, 1824 populate_read_section, NULL); 1825 } else { 1826 populate_read_range(rb, 0, rb->used_length); 1827 } 1828 } 1829 1830 /* 1831 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1832 */ 1833 void ram_write_tracking_prepare(void) 1834 { 1835 RAMBlock *block; 1836 1837 RCU_READ_LOCK_GUARD(); 1838 1839 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1840 /* Nothing to do with read-only and MMIO-writable regions */ 1841 if (block->mr->readonly || block->mr->rom_device) { 1842 continue; 1843 } 1844 1845 /* 1846 * Populate pages of the RAM block before enabling userfault_fd 1847 * write protection. 1848 * 1849 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1850 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1851 * pages with pte_none() entries in page table. 
1852 */ 1853 ram_block_populate_read(block); 1854 } 1855 } 1856 1857 /* 1858 * ram_write_tracking_start: start UFFD-WP memory tracking 1859 * 1860 * Returns 0 for success or negative value in case of error 1861 */ 1862 int ram_write_tracking_start(void) 1863 { 1864 int uffd_fd; 1865 RAMState *rs = ram_state; 1866 RAMBlock *block; 1867 1868 /* Open UFFD file descriptor */ 1869 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1870 if (uffd_fd < 0) { 1871 return uffd_fd; 1872 } 1873 rs->uffdio_fd = uffd_fd; 1874 1875 RCU_READ_LOCK_GUARD(); 1876 1877 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1878 /* Nothing to do with read-only and MMIO-writable regions */ 1879 if (block->mr->readonly || block->mr->rom_device) { 1880 continue; 1881 } 1882 1883 /* Register block memory with UFFD to track writes */ 1884 if (uffd_register_memory(rs->uffdio_fd, block->host, 1885 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1886 goto fail; 1887 } 1888 /* Apply UFFD write protection to the block memory range */ 1889 if (uffd_change_protection(rs->uffdio_fd, block->host, 1890 block->max_length, true, false)) { 1891 goto fail; 1892 } 1893 block->flags |= RAM_UF_WRITEPROTECT; 1894 memory_region_ref(block->mr); 1895 1896 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1897 block->host, block->max_length); 1898 } 1899 1900 return 0; 1901 1902 fail: 1903 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1904 1905 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1906 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1907 continue; 1908 } 1909 /* 1910 * In case some memory block failed to be write-protected 1911 * remove protection and unregister all succeeded RAM blocks 1912 */ 1913 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1914 false, false); 1915 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1916 /* Cleanup flags and remove reference */ 1917 block->flags &= ~RAM_UF_WRITEPROTECT; 
1918 memory_region_unref(block->mr); 1919 } 1920 1921 uffd_close_fd(uffd_fd); 1922 rs->uffdio_fd = -1; 1923 return -1; 1924 } 1925 1926 /** 1927 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1928 */ 1929 void ram_write_tracking_stop(void) 1930 { 1931 RAMState *rs = ram_state; 1932 RAMBlock *block; 1933 1934 RCU_READ_LOCK_GUARD(); 1935 1936 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1937 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1938 continue; 1939 } 1940 /* Remove protection and unregister all affected RAM blocks */ 1941 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1942 false, false); 1943 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1944 1945 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1946 block->host, block->max_length); 1947 1948 /* Cleanup flags and remove reference */ 1949 block->flags &= ~RAM_UF_WRITEPROTECT; 1950 memory_region_unref(block->mr); 1951 } 1952 1953 /* Finally close UFFD file descriptor */ 1954 uffd_close_fd(rs->uffdio_fd); 1955 rs->uffdio_fd = -1; 1956 } 1957 1958 #else 1959 /* No target OS support, stubs just fail or ignore */ 1960 1961 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1962 { 1963 (void) rs; 1964 (void) offset; 1965 1966 return NULL; 1967 } 1968 1969 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1970 unsigned long start_page) 1971 { 1972 (void) rs; 1973 (void) pss; 1974 (void) start_page; 1975 1976 return 0; 1977 } 1978 1979 bool ram_write_tracking_available(void) 1980 { 1981 return false; 1982 } 1983 1984 bool ram_write_tracking_compatible(void) 1985 { 1986 assert(0); 1987 return false; 1988 } 1989 1990 int ram_write_tracking_start(void) 1991 { 1992 assert(0); 1993 return -1; 1994 } 1995 1996 void ram_write_tracking_stop(void) 1997 { 1998 assert(0); 1999 } 2000 #endif /* defined(__linux__) */ 2001 2002 /* 2003 * Check whether two addr/offset of the ramblock falls onto 
the same host huge 2004 * page. Returns true if so, false otherwise. 2005 */ 2006 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1, 2007 uint64_t addr2) 2008 { 2009 size_t page_size = qemu_ram_pagesize(rb); 2010 2011 addr1 = ROUND_DOWN(addr1, page_size); 2012 addr2 = ROUND_DOWN(addr2, page_size); 2013 2014 return addr1 == addr2; 2015 } 2016 2017 /* 2018 * Whether a previous preempted precopy huge page contains current requested 2019 * page? Returns true if so, false otherwise. 2020 * 2021 * This should really happen very rarely, because it means when we were sending 2022 * during background migration for postcopy we're sending exactly the page that 2023 * some vcpu got faulted on on dest node. When it happens, we probably don't 2024 * need to do much but drop the request, because we know right after we restore 2025 * the precopy stream it'll be serviced. It'll slightly affect the order of 2026 * postcopy requests to be serviced (e.g. it'll be the same as we move current 2027 * request to the end of the queue) but it shouldn't be a big deal. The most 2028 * imporant thing is we can _never_ try to send a partial-sent huge page on the 2029 * POSTCOPY channel again, otherwise that huge page will got "split brain" on 2030 * two channels (PRECOPY, POSTCOPY). 2031 */ 2032 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block, 2033 ram_addr_t offset) 2034 { 2035 PostcopyPreemptState *state = &rs->postcopy_preempt_state; 2036 2037 /* No preemption at all? */ 2038 if (!state->preempted) { 2039 return false; 2040 } 2041 2042 /* Not even the same ramblock? 
*/ 2043 if (state->ram_block != block) { 2044 return false; 2045 } 2046 2047 return offset_on_same_huge_page(block, offset, 2048 state->ram_page << TARGET_PAGE_BITS); 2049 } 2050 2051 /** 2052 * get_queued_page: unqueue a page from the postcopy requests 2053 * 2054 * Skips pages that are already sent (!dirty) 2055 * 2056 * Returns true if a queued page is found 2057 * 2058 * @rs: current RAM state 2059 * @pss: data about the state of the current dirty page scan 2060 */ 2061 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2062 { 2063 RAMBlock *block; 2064 ram_addr_t offset; 2065 bool dirty; 2066 2067 do { 2068 block = unqueue_page(rs, &offset); 2069 /* 2070 * We're sending this page, and since it's postcopy nothing else 2071 * will dirty it, and we must make sure it doesn't get sent again 2072 * even if this queue request was received after the background 2073 * search already sent it. 2074 */ 2075 if (block) { 2076 unsigned long page; 2077 2078 page = offset >> TARGET_PAGE_BITS; 2079 dirty = test_bit(page, block->bmap); 2080 if (!dirty) { 2081 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2082 page); 2083 } else { 2084 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2085 } 2086 } 2087 2088 } while (block && !dirty); 2089 2090 if (block) { 2091 /* See comment above postcopy_preempted_contains() */ 2092 if (postcopy_preempted_contains(rs, block, offset)) { 2093 trace_postcopy_preempt_hit(block->idstr, offset); 2094 /* 2095 * If what we preempted previously was exactly what we're 2096 * requesting right now, restore the preempted precopy 2097 * immediately, boosting its priority as it's requested by 2098 * postcopy. 2099 */ 2100 postcopy_preempt_restore(rs, pss, true); 2101 return true; 2102 } 2103 } else { 2104 /* 2105 * Poll write faults too if background snapshot is enabled; that's 2106 * when we have vcpus got blocked by the write protected pages. 
2107 */ 2108 block = poll_fault_page(rs, &offset); 2109 } 2110 2111 if (block) { 2112 /* 2113 * We want the background search to continue from the queued page 2114 * since the guest is likely to want other pages near to the page 2115 * it just requested. 2116 */ 2117 pss->block = block; 2118 pss->page = offset >> TARGET_PAGE_BITS; 2119 2120 /* 2121 * This unqueued page would break the "one round" check, even is 2122 * really rare. 2123 */ 2124 pss->complete_round = false; 2125 /* Mark it an urgent request, meanwhile using POSTCOPY channel */ 2126 pss->postcopy_requested = true; 2127 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY; 2128 } 2129 2130 return !!block; 2131 } 2132 2133 /** 2134 * migration_page_queue_free: drop any remaining pages in the ram 2135 * request queue 2136 * 2137 * It should be empty at the end anyway, but in error cases there may 2138 * be some left. in case that there is any page left, we drop it. 2139 * 2140 */ 2141 static void migration_page_queue_free(RAMState *rs) 2142 { 2143 struct RAMSrcPageRequest *mspr, *next_mspr; 2144 /* This queue generally should be empty - but in the case of a failed 2145 * migration might have some droppings in. 2146 */ 2147 RCU_READ_LOCK_GUARD(); 2148 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2149 memory_region_unref(mspr->rb->mr); 2150 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2151 g_free(mspr); 2152 } 2153 } 2154 2155 /** 2156 * ram_save_queue_pages: queue the page for transmission 2157 * 2158 * A request from postcopy destination for example. 2159 * 2160 * Returns zero on success or negative on error 2161 * 2162 * @rbname: Name of the RAMBLock of the request. NULL means the 2163 * same that last one. 
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_new0(struct RAMSrcPageRequest, 1);
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    /* Hold a ref on the block until the queued request is consumed/freed */
    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}

/* Whether compression should be used for the next page to be sent */
static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is enabled (e.g., after first round of migration), stop
     * using the data compression. In theory, xbzrle can do better than
     * compression.
     */
    if (rs->xbzrle_enabled) {
        return false;
    }

    return true;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    /* No free compress thread was available; account it and fall back */
    compression_counters.busy++;
    return false;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss);
}

/* Whether the current precopy page should be preempted by a postcopy request */
static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
{
    MigrationState *ms = migrate_get_current();

    /* Not enabled eager preempt?  Then never do that. */
    if (!migrate_postcopy_preempt()) {
        return false;
    }

    /* If the user explicitly disabled breaking of huge page, skip */
    if (!ms->postcopy_preempt_break_huge) {
        return false;
    }

    /* If the ramblock we're sending is a small page?  Never bother. */
    if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
        return false;
    }

    /* Not in postcopy at all? */
    if (!migration_in_postcopy()) {
        return false;
    }

    /*
     * If we're already handling a postcopy request, don't preempt as this page
     * has got the same high priority.
     */
    if (pss->postcopy_requested) {
        return false;
    }

    /* If there's postcopy requests, then check it up! */
    return postcopy_has_request(rs);
}

/*
 * Preempt the current precopy page: cache the PSS so it can be restored
 * later by postcopy_preempt_restore().  (Note: returns nothing; the caller
 * has already decided to preempt via postcopy_needs_preempt().)
 */
static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
{
    PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;

    trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);

    /*
     * Time to preempt precopy. Cache current PSS into preempt state, so that
     * after handling the postcopy pages we can recover to it.  We need to do
     * so because the dest VM will have partial of the precopy huge page kept
     * over in its tmp huge page caches; better move on with it when we can.
     */
    p_state->ram_block = pss->block;
    p_state->ram_page = pss->page;
    p_state->preempted = true;
}

/* Whether we're preempted by a postcopy request during sending a huge page */
static bool postcopy_preempt_triggered(RAMState *rs)
{
    return rs->postcopy_preempt_state.preempted;
}

/*
 * Restore the PSS cached by postcopy_do_preempt() so the interrupted
 * precopy huge page can continue to be sent.
 */
static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
                                     bool postcopy_requested)
{
    PostcopyPreemptState *state = &rs->postcopy_preempt_state;

    assert(state->preempted);

    pss->block = state->ram_block;
    pss->page = state->ram_page;

    /* Whether this is a postcopy request? */
    pss->postcopy_requested = postcopy_requested;
    /*
     * When restoring a preempted page, the old data resides in PRECOPY
     * slow channel, even if postcopy_requested is set.  So always use
     * PRECOPY channel here.
     */
    pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;

    trace_postcopy_preempt_restored(pss->block->idstr, pss->page);

    /* Reset preempt state, most importantly, set preempted==false */
    postcopy_preempt_reset(rs);
}

/* Switch rs->f to the QEMUFile matching the channel the page should use */
static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
{
    MigrationState *s = migrate_get_current();
    unsigned int channel = pss->postcopy_target_channel;
    QEMUFile *next;

    if (channel != rs->postcopy_channel) {
        if (channel == RAM_CHANNEL_PRECOPY) {
            next = s->to_dst_file;
        } else {
            next = s->postcopy_qemufile_src;
        }
        /* Update and cache the current channel */
        rs->f = next;
        rs->postcopy_channel = channel;

        /*
         * If channel switched, reset last_sent_block since the old sent block
         * may not be on the same channel.
         */
        rs->last_sent_block = NULL;

        trace_postcopy_preempt_switch_channel(channel);
    }

    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
}

/* We need to make sure rs->f always points to the default channel elsewhere */
static void postcopy_preempt_reset_channel(RAMState *rs)
{
    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
        rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
        rs->f = migrate_get_current()->to_dst_file;
        trace_postcopy_preempt_reset_channel();
    }
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
    int tmppages, pages = 0;
    /* Number of target pages per host page (1 for non-huge-page blocks) */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long hostpage_boundary =
        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
        postcopy_preempt_choose_channel(rs, pss);
    }

    do {
        if (postcopy_needs_preempt(rs, pss)) {
            postcopy_do_preempt(rs, pss);
            break;
        }

        /* Check the pages is dirty and if it is send it */
        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            tmppages = ram_save_target_page(rs, pss);
            if (tmppages < 0) {
                return tmppages;
            }

            pages += tmppages;
            /*
             * Allow rate limiting to happen in the middle of huge pages if
             * something is sent in the current iteration.
             */
            if (pagesize_bits > 1 && tmppages > 0) {
                migration_rate_limit();
            }
        }
        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    } while ((pss->page < hostpage_boundary) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the min boundary of host page and block */
    pss->page = MIN(pss->page, hostpage_boundary);

    /*
     * When with postcopy preempt mode, flush the data as soon as possible for
     * postcopy requests, because we've already sent a whole huge page, so the
     * dst node should already have enough resource to atomically filling in
     * the current missing page.
     *
     * More importantly, when using separate postcopy channel, we must do
     * explicit flush or it won't flush until the buffer is full.
     */
    if (migrate_postcopy_preempt() && pss->postcopy_requested) {
        qemu_fflush(rs->f);
    }

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /*
             * Recover previous precopy ramblock/offset if postcopy has
             * preempted precopy. Otherwise find the next dirty bit.
             */
            if (postcopy_preempt_triggered(rs)) {
                postcopy_preempt_restore(rs, &pss, false);
                found = true;
            } else {
                /* priority queue empty, so just search for something dirty */
                found = find_dirty_block(rs, &pss, &again);
            }
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}

/* Account a transferred (or zero/duplicate) chunk of @size bytes */
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_transferred_add(size);
        qemu_file_credit_transfer(f, size);
    }
}

/* Sum of used_length of all blocks; optionally including ignored ones */
static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    RCU_READ_LOCK_GUARD();

    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    return total;
}

uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

/* Free the RAMState and its queues/mutexes; *rsp is NULLed afterwards */
static void ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}

static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* We don't use dirty log with background snapshots */
    if (!migrate_background_snapshot()) {
        /* caller have hold iothread lock or is in a bh, so there is
         * no writing race against the migration bitmap
         */
        if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
            /*
             * do not stop dirty log without starting it, since
             * memory_global_dirty_log_stop will assert that
             * memory_global_dirty_log_start/stop used in pairs
             */
            memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
        }
    }

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}

/* Reset the per-iteration scan state; called on setup and ram_list change */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->xbzrle_enabled = false;
    postcopy_preempt_reset(rs);
    rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/* **** functions for postcopy ***** */

/* Discard (on the source) all pages already migrated to the destination */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end =
                find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 *
 * @ms: current migration state
 * @block: RAMBlock to discard
 */
static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *bitmap = block->bmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(bitmap, end, current);
        unsigned long zero, discard_length;

        if (one >= end) {
            break;
        }

        zero = find_next_zero_bit(bitmap, end, one + 1);

        if (zero >= end) {
            discard_length = end - one;
        } else {
            discard_length = zero - one;
        }
        postcopy_discard_send_range(ms, one, discard_length);
        current = one + discard_length;
    }
}

static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static void postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        postcopy_discard_send_init(ms, block->idstr);

        /*
         * Deal with TPS != HPS and huge pages.  It discards any partially
         * sent host-page size chunks and marks any partially dirty host-page
         * size chunks as all dirty.  In this case the host-page is the
         * host-page for the particular RAMBlock, i.e. it might be a huge page.
         */
        postcopy_chunk_hostpages_pass(ms, block);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        postcopy_send_discard_bm_ram(ms, block);
        postcopy_discard_send_finish(ms);
    }
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
void ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    postcopy_each_ram_send_discard(ms);

    trace_ram_postcopy_send_discard_bitmap();
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}

/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

/* goto-based unwinding: free in reverse order of allocation */
free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

/* Allocate and initialize the global RAMState; returns 0 or -1 on OOM */
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match with the initial values of dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}

/* Allocate the per-block dirty bitmaps (bmap/clear_bmap), all-ones initially */
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when rebegin a
             * new migration after a failed migration, ram_list.
             * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
             * guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}

/* Drop pages discarded by virtio-mem etc. from the initial dirty bitmap */
static void migration_bitmap_clear_discarded_pages(RAMState *rs)
{
    unsigned long pages;
    RAMBlock *rb;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
        rs->migration_dirty_pages -= pages;
    }
}

static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.
     */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}

/* Full save-side init: RAMState, xbzrle cache, dirty bitmaps */
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}

/* Prepare RAMState for a postcopy-recovery resume on @out */
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    ram_state_reset(rs);

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}

/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        /*
         * The skipped free pages are equivalent to be sent from clear_bmap's
         * perspective, so clear the bits from the memory region bitmap which
         * are initially set. Otherwise those skipped pages will be sent in
         * the next round after syncing from the memory region bitmap.
         */
        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
        ram_state->migration_dirty_pages -=
            bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;
    int ret;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Total RAM size is sent first, tagged with MEM_SIZE. */
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        /* Then one record per migratable RAMBlock: idstr, length, extras. */
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    ret = multifd_send_sync_main(f);
    if (ret < 0) {
        return ret;
    }

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Keep sending pages while under the rate limit (or postcopy asks) */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                    1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    postcopy_preempt_reset_channel(rs);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        ret = multifd_send_sync_main(rs->f);
        if (ret < 0) {
            return ret;
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker just written */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    rs->last_stage = !migration_in_colo_state();

    WITH_RCU_READ_LOCK_GUARD() {
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs);
            /* no more blocks to send */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret < 0) {
        return ret;
    }

    postcopy_preempt_reset_channel(rs);

    ret = multifd_send_sync_main(rs->f);
    if (ret < 0) {
        return ret;
    }

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

/*
 * ram_save_pending: estimate the remaining amount of dirty RAM.
 * Re-syncs the dirty bitmap (under the iothread lock) when the cached
 * estimate has dropped below max_size, then reports the result as
 * postcopiable or precopy-only depending on the postcopy capability.
 */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}

/*
 * load_xbzrle: read one XBZRLE-encoded page from the stream and apply the
 * delta on top of the current contents of @host.  Returns 0 on success,
 * -1 on a malformed header or decode failure.
 */
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @mis: the migration incoming state pointer
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 * @channel: the channel we're using
 */
static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
                                              QEMUFile *f, int flags,
                                              int channel)
{
    RAMBlock *block = mis->last_recv_block[channel];
    char id[256];
    uint8_t len;

    /* CONTINUE means "same block as the previous page on this channel" */
    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    /* Cache the block so CONTINUE pages on this channel can skip the id */
    mis->last_recv_block[channel] = block;

    return block;
}

/*
 * Translate a RAMBlock-relative offset into a host pointer, or NULL when
 * the offset is outside the block's used length.
 */
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

static void *host_page_from_ram_block_offset(RAMBlock *block,
                                             ram_addr_t offset)
{
    /* Note: Explicitly no check against offset_in_ramblock().
*/ 3562 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3563 block->page_size); 3564 } 3565 3566 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3567 ram_addr_t offset) 3568 { 3569 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3570 } 3571 3572 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3573 ram_addr_t offset, bool record_bitmap) 3574 { 3575 if (!offset_in_ramblock(block, offset)) { 3576 return NULL; 3577 } 3578 if (!block->colo_cache) { 3579 error_report("%s: colo_cache is NULL in block :%s", 3580 __func__, block->idstr); 3581 return NULL; 3582 } 3583 3584 /* 3585 * During colo checkpoint, we need bitmap of these migrated pages. 3586 * It help us to decide which pages in ram cache should be flushed 3587 * into VM's RAM later. 3588 */ 3589 if (record_bitmap && 3590 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3591 ram_state->migration_dirty_pages++; 3592 } 3593 return block->colo_cache + offset; 3594 } 3595 3596 /** 3597 * ram_handle_compressed: handle the zero page case 3598 * 3599 * If a page (or a whole RDMA chunk) has been 3600 * determined to be zero, then zap it. 3601 * 3602 * @host: host address for the zero page 3603 * @ch: what the page is filled from. 
We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the memset when the destination is already all-zero */
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    /* Reset the zlib stream so it can be reused for every page */
    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

/*
 * Decompress worker thread body: waits on param->cond for work
 * (param->des set by decompress_data_with_multi_threads), inflates one
 * page, then signals completion via decomp_done_cond.  Exits when
 * param->quit is set.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

/*
 * Block until every decompress worker has marked itself done, then
 * return any error recorded on the decompression file.
 */
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

/* Ask all decompress workers to quit, join them and free their state. */
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

/*
 * Spawn one decompress worker per configured thread; on any init
 * failure, tear down whatever was set up and return -1.
 */
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

/*
 * Hand one compressed page to an idle decompress worker, waiting on
 * decomp_done_cond until one becomes free.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}

static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}

/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM, it is necessary to hold the global lock
 * to call this helper.
3808 */ 3809 int colo_init_ram_cache(void) 3810 { 3811 RAMBlock *block; 3812 3813 WITH_RCU_READ_LOCK_GUARD() { 3814 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3815 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3816 NULL, false, false); 3817 if (!block->colo_cache) { 3818 error_report("%s: Can't alloc memory for COLO cache of block %s," 3819 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3820 block->used_length); 3821 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3822 if (block->colo_cache) { 3823 qemu_anon_ram_free(block->colo_cache, block->used_length); 3824 block->colo_cache = NULL; 3825 } 3826 } 3827 return -errno; 3828 } 3829 if (!machine_dump_guest_core(current_machine)) { 3830 qemu_madvise(block->colo_cache, block->used_length, 3831 QEMU_MADV_DONTDUMP); 3832 } 3833 } 3834 } 3835 3836 /* 3837 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3838 * with to decide which page in cache should be flushed into SVM's RAM. Here 3839 * we use the same name 'ram_bitmap' as for migration. 3840 */ 3841 if (ram_bytes_total()) { 3842 RAMBlock *block; 3843 3844 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3845 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3846 block->bmap = bitmap_new(pages); 3847 } 3848 } 3849 3850 colo_init_ram_state(); 3851 return 0; 3852 } 3853 3854 /* TODO: duplicated with ram_init_bitmaps */ 3855 void colo_incoming_start_dirty_log(void) 3856 { 3857 RAMBlock *block = NULL; 3858 /* For memory_global_dirty_log_start below. 
 */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* It is necessary to hold the global lock to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

/*
 * Incoming-side teardown: write back RAM contents, release XBZRLE and
 * decompression state, and free the per-block received-page bitmaps.
 */
static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 *
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* Low bits of the address word carry the page flags */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false
             * positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

/* True from the time the source ADVISEs postcopy until incoming ends. */
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

/* True while the postcopy listen/run phase is active on the incoming side. */
static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that were dirtied by PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    /* Pull the latest dirty info into each block's bitmap first */
    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        /* Walk every block, copying each dirty run from cache to RAM */
        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                /* No more dirty pages in this block: move to the next one */
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so do it each some iterations
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        /* Low bits of the address word carry the page flags */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly, we put them into colo_cache firstly.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all these memory in preparing stage of COLO
             * while we need to stop VM, which is a time-consuming process.
             * Here we optimize it by a trick, back-up every page while in
             * migration process while COLO is enabled, though it affects the
             * speed of the migration, but it obviously reduce the downtime of
             * back-up all SVM'S memory in COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * Put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        /* Pre-COLO stage: mirror the loaded page into the COLO cache too */
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

/*
 * ram_load: incoming entry point for the "ram" section; dispatches to
 * the postcopy or precopy loader depending on the incoming state.
 */
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

/* Postcopy is refused when any RAMBlock is backed by pmem/nvdimm. */
static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmap with destination VM.
 */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    /* Ask the destination for the received bitmap of every ramblock */
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /*
     * Wait until all the ramblocks' dirty bitmap synced; rp_sem is posted
     * once per reloaded block by ram_dirty_bitmap_reload_notify().
     */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    /* Always succeeds; per-block errors surface on the return path instead */
    return 0;
}

/* Wake up ram_dirty_bitmap_sync_all(): one more block's bitmap has arrived */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 *
 * Stream format consumed here (see ramblock_recv_bitmap_send() for the
 * sender side): be64 size, little-endian bitmap payload of that size,
 * be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).
 *
 * Returns 0 on success, -EINVAL/-EIO on malformed or short input.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    /* Bitmap payload size in bytes (before padding to 8-byte multiple) */
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    /* Bitmap reload is only meaningful while in postcopy recovery */
    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    /* Treat channel errors and short reads the same way: abort the reload */
    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded to sync bitmap for current ramblock. If this is
     * the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

/*
 * SaveVMHandlers.resume_prepare callback: re-sync dirty bitmaps with the
 * destination, then rebuild the RAMState counters before resuming a
 * paused postcopy migration.
 */
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

/* Send a final EOS flag on the preempt channel and flush it */
void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

/* Callbacks wiring the "ram" section into the generic savevm machinery */
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

/*
 * RAMBlockNotifier callback invoked when a RAM block is resized.
 *
 * On the source a resize during an active migration is fatal to the
 * migration (it is cancelled); on the destination the reaction depends on
 * the postcopy state at the time of the resize.
 */
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    /* Blocks excluded from migration don't affect it when resized */
    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy does no longer care about
         * resizes. When growing, the new memory was not available on the
         * source, no handler needed.
         */
        break;
    default:
        /* Unexpected state: something is badly wrong, give up */
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

/* Register the "ram" savevm section and the RAM-block resize notifier */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}