1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "io/channel-null.h" 36 #include "xbzrle.h" 37 #include "ram.h" 38 #include "migration.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qmp/qerror.h" 49 #include "trace.h" 50 #include "exec/ram_addr.h" 51 #include "exec/target_page.h" 52 #include "qemu/rcu_queue.h" 53 #include "migration/colo.h" 54 #include "block.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 61 #include "hw/boards.h" /* for machine_dump_guest_core() */ 62 63 #if defined(__linux__) 64 #include "qemu/userfaultfd.h" 65 #endif /* defined(__linux__) */ 66 67 /***********************************************************/ 68 /* ram save/restore */ 69 70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 71 * worked for pages that where filled with the same char. We switched 72 * it to only search for the zero value. And to avoid confusion with 73 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it. 74 */ 75 76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 77 #define RAM_SAVE_FLAG_ZERO 0x02 78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 79 #define RAM_SAVE_FLAG_PAGE 0x08 80 #define RAM_SAVE_FLAG_EOS 0x10 81 #define RAM_SAVE_FLAG_CONTINUE 0x20 82 #define RAM_SAVE_FLAG_XBZRLE 0x40 83 /* 0x80 is reserved in migration.h start with 0x100 next */ 84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 85 86 XBZRLECacheStats xbzrle_counters; 87 88 /* struct contains XBZRLE cache and a static page 89 used by the compression */ 90 static struct { 91 /* buffer used for XBZRLE encoding */ 92 uint8_t *encoded_buf; 93 /* buffer for storing page content */ 94 uint8_t *current_buf; 95 /* Cache for XBZRLE, Protected by lock. 
*/ 96 PageCache *cache; 97 QemuMutex lock; 98 /* it will store a page full of zeros */ 99 uint8_t *zero_target_page; 100 /* buffer used for XBZRLE decoding */ 101 uint8_t *decoded_buf; 102 } XBZRLE; 103 104 static void XBZRLE_cache_lock(void) 105 { 106 if (migrate_use_xbzrle()) { 107 qemu_mutex_lock(&XBZRLE.lock); 108 } 109 } 110 111 static void XBZRLE_cache_unlock(void) 112 { 113 if (migrate_use_xbzrle()) { 114 qemu_mutex_unlock(&XBZRLE.lock); 115 } 116 } 117 118 /** 119 * xbzrle_cache_resize: resize the xbzrle cache 120 * 121 * This function is called from migrate_params_apply in main 122 * thread, possibly while a migration is in progress. A running 123 * migration may be using the cache and might finish during this call, 124 * hence changes to the cache are protected by XBZRLE.lock(). 125 * 126 * Returns 0 for success or -1 for error 127 * 128 * @new_size: new cache size 129 * @errp: set *errp if the check failed, with reason 130 */ 131 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 132 { 133 PageCache *new_cache; 134 int64_t ret = 0; 135 136 /* Check for truncation */ 137 if (new_size != (size_t)new_size) { 138 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 139 "exceeding address space"); 140 return -1; 141 } 142 143 if (new_size == migrate_xbzrle_cache_size()) { 144 /* nothing to do */ 145 return 0; 146 } 147 148 XBZRLE_cache_lock(); 149 150 if (XBZRLE.cache != NULL) { 151 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 152 if (!new_cache) { 153 ret = -1; 154 goto out; 155 } 156 157 cache_fini(XBZRLE.cache); 158 XBZRLE.cache = new_cache; 159 } 160 out: 161 XBZRLE_cache_unlock(); 162 return ret; 163 } 164 165 bool ramblock_is_ignored(RAMBlock *block) 166 { 167 return !qemu_ram_is_migratable(block) || 168 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 169 } 170 171 #undef RAMBLOCK_FOREACH 172 173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 174 { 175 RAMBlock *block; 176 int ret = 0; 177 178 RCU_READ_LOCK_GUARD(); 179 180 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 181 ret = func(block, opaque); 182 if (ret) { 183 break; 184 } 185 } 186 return ret; 187 } 188 189 static void ramblock_recv_map_init(void) 190 { 191 RAMBlock *rb; 192 193 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 194 assert(!rb->receivedmap); 195 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 196 } 197 } 198 199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 200 { 201 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 202 rb->receivedmap); 203 } 204 205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 206 { 207 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 208 } 209 210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 211 { 212 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 213 } 214 215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 216 size_t nr) 217 { 218 bitmap_set_atomic(rb->receivedmap, 219 ramblock_recv_bitmap_offset(host_addr, rb), 220 nr); 221 } 222 223 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 224 225 /* 226 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 227 * 228 * Returns >0 if success with sent bytes, or <0 if error. 
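 *
 * A rough sketch of how the peer side is expected to consume this layout
 * (hypothetical reader shown only for illustration, not the actual receive
 * path in this file):
 *
 *   uint64_t size = qemu_get_be64(file);
 *   g_autofree unsigned long *le = g_malloc0(size);
 *
 *   qemu_get_buffer(file, (uint8_t *)le, size);
 *   if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *       return -EINVAL;
 *   }
 *   bitmap_from_le(dst_bitmap, le, nbits);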
229 */ 230 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 231 const char *block_name) 232 { 233 RAMBlock *block = qemu_ram_block_by_name(block_name); 234 unsigned long *le_bitmap, nbits; 235 uint64_t size; 236 237 if (!block) { 238 error_report("%s: invalid block name: %s", __func__, block_name); 239 return -1; 240 } 241 242 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 243 244 /* 245 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 246 * machines we may need 4 more bytes for padding (see below 247 * comment). So extend it a bit before hand. 248 */ 249 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 250 251 /* 252 * Always use little endian when sending the bitmap. This is 253 * required that when source and destination VMs are not using the 254 * same endianness. (Note: big endian won't work.) 255 */ 256 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 257 258 /* Size of the bitmap, in bytes */ 259 size = DIV_ROUND_UP(nbits, 8); 260 261 /* 262 * size is always aligned to 8 bytes for 64bit machines, but it 263 * may not be true for 32bit machines. We need this padding to 264 * make sure the migration can survive even between 32bit and 265 * 64bit machines. 266 */ 267 size = ROUND_UP(size, 8); 268 269 qemu_put_be64(file, size); 270 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 271 /* 272 * Mark as an end, in case the middle part is screwed up due to 273 * some "mysterious" reason. 274 */ 275 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 276 qemu_fflush(file); 277 278 g_free(le_bitmap); 279 280 if (qemu_file_get_error(file)) { 281 return qemu_file_get_error(file); 282 } 283 284 return size + sizeof(size); 285 } 286 287 /* 288 * An outstanding page request, on the source, having been received 289 * and queued 290 */ 291 struct RAMSrcPageRequest { 292 RAMBlock *rb; 293 hwaddr offset; 294 hwaddr len; 295 296 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 297 }; 298 299 typedef struct { 300 /* 301 * Cached ramblock/offset values if preempted. They're only meaningful if 302 * preempted==true below. 303 */ 304 RAMBlock *ram_block; 305 unsigned long ram_page; 306 /* 307 * Whether a postcopy preemption just happened. Will be reset after 308 * precopy recovered to background migration. 
309 */ 310 bool preempted; 311 } PostcopyPreemptState; 312 313 /* State of RAM for migration */ 314 struct RAMState { 315 /* QEMUFile used for this migration */ 316 QEMUFile *f; 317 /* UFFD file descriptor, used in 'write-tracking' migration */ 318 int uffdio_fd; 319 /* Last block that we have visited searching for dirty pages */ 320 RAMBlock *last_seen_block; 321 /* Last block from where we have sent data */ 322 RAMBlock *last_sent_block; 323 /* Last dirty target page we have sent */ 324 ram_addr_t last_page; 325 /* last ram version we have seen */ 326 uint32_t last_version; 327 /* How many times we have dirty too many pages */ 328 int dirty_rate_high_cnt; 329 /* these variables are used for bitmap sync */ 330 /* last time we did a full bitmap_sync */ 331 int64_t time_last_bitmap_sync; 332 /* bytes transferred at start_time */ 333 uint64_t bytes_xfer_prev; 334 /* number of dirty pages since start_time */ 335 uint64_t num_dirty_pages_period; 336 /* xbzrle misses since the beginning of the period */ 337 uint64_t xbzrle_cache_miss_prev; 338 /* Amount of xbzrle pages since the beginning of the period */ 339 uint64_t xbzrle_pages_prev; 340 /* Amount of xbzrle encoded bytes since the beginning of the period */ 341 uint64_t xbzrle_bytes_prev; 342 /* Start using XBZRLE (e.g., after the first round). */ 343 bool xbzrle_enabled; 344 /* Are we on the last stage of migration */ 345 bool last_stage; 346 /* compression statistics since the beginning of the period */ 347 /* amount of count that no free thread to compress data */ 348 uint64_t compress_thread_busy_prev; 349 /* amount bytes after compression */ 350 uint64_t compressed_size_prev; 351 /* amount of compressed pages */ 352 uint64_t compress_pages_prev; 353 354 /* total handled target pages at the beginning of period */ 355 uint64_t target_page_count_prev; 356 /* total handled target pages since start */ 357 uint64_t target_page_count; 358 /* number of dirty bits in the bitmap */ 359 uint64_t migration_dirty_pages; 360 /* Protects modification of the bitmap and migration dirty pages */ 361 QemuMutex bitmap_mutex; 362 /* The RAMBlock used in the last src_page_requests */ 363 RAMBlock *last_req_rb; 364 /* Queue of outstanding page requests from the destination */ 365 QemuMutex src_page_req_mutex; 366 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 367 368 /* Postcopy preemption informations */ 369 PostcopyPreemptState postcopy_preempt_state; 370 /* 371 * Current channel we're using on src VM. Only valid if postcopy-preempt 372 * is enabled. 373 */ 374 unsigned int postcopy_channel; 375 }; 376 typedef struct RAMState RAMState; 377 378 static RAMState *ram_state; 379 380 static NotifierWithReturnList precopy_notifier_list; 381 382 static void postcopy_preempt_reset(RAMState *rs) 383 { 384 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState)); 385 } 386 387 /* Whether postcopy has queued requests? 
*/ 388 static bool postcopy_has_request(RAMState *rs) 389 { 390 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 391 } 392 393 void precopy_infrastructure_init(void) 394 { 395 notifier_with_return_list_init(&precopy_notifier_list); 396 } 397 398 void precopy_add_notifier(NotifierWithReturn *n) 399 { 400 notifier_with_return_list_add(&precopy_notifier_list, n); 401 } 402 403 void precopy_remove_notifier(NotifierWithReturn *n) 404 { 405 notifier_with_return_remove(n); 406 } 407 408 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 409 { 410 PrecopyNotifyData pnd; 411 pnd.reason = reason; 412 pnd.errp = errp; 413 414 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 415 } 416 417 uint64_t ram_bytes_remaining(void) 418 { 419 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 420 0; 421 } 422 423 MigrationStats ram_counters; 424 425 static void ram_transferred_add(uint64_t bytes) 426 { 427 if (runstate_is_running()) { 428 ram_counters.precopy_bytes += bytes; 429 } else if (migration_in_postcopy()) { 430 ram_counters.postcopy_bytes += bytes; 431 } else { 432 ram_counters.downtime_bytes += bytes; 433 } 434 ram_counters.transferred += bytes; 435 } 436 437 /* used by the search for pages to send */ 438 struct PageSearchStatus { 439 /* Current block being searched */ 440 RAMBlock *block; 441 /* Current page to search from */ 442 unsigned long page; 443 /* Set once we wrap around */ 444 bool complete_round; 445 /* 446 * [POSTCOPY-ONLY] Whether current page is explicitly requested by 447 * postcopy. When set, the request is "urgent" because the dest QEMU 448 * threads are waiting for us. 449 */ 450 bool postcopy_requested; 451 /* 452 * [POSTCOPY-ONLY] The target channel to use to send current page. 453 * 454 * Note: This may _not_ match with the value in postcopy_requested 455 * above. Let's imagine the case where the postcopy request is exactly 456 * the page that we're sending in progress during precopy. In this case 457 * we'll have postcopy_requested set to true but the target channel 458 * will be the precopy channel (so that we don't split brain on that 459 * specific page since the precopy channel already contains partial of 460 * that page data). 461 * 462 * Besides that specific use case, postcopy_target_channel should 463 * always be equal to postcopy_requested, because by default we send 464 * postcopy pages via postcopy preempt channel. 465 */ 466 bool postcopy_target_channel; 467 }; 468 typedef struct PageSearchStatus PageSearchStatus; 469 470 CompressionStats compression_counters; 471 472 struct CompressParam { 473 bool done; 474 bool quit; 475 bool zero_page; 476 QEMUFile *file; 477 QemuMutex mutex; 478 QemuCond cond; 479 RAMBlock *block; 480 ram_addr_t offset; 481 482 /* internally used fields */ 483 z_stream stream; 484 uint8_t *originbuf; 485 }; 486 typedef struct CompressParam CompressParam; 487 488 struct DecompressParam { 489 bool done; 490 bool quit; 491 QemuMutex mutex; 492 QemuCond cond; 493 void *des; 494 uint8_t *compbuf; 495 int len; 496 z_stream stream; 497 }; 498 typedef struct DecompressParam DecompressParam; 499 500 static CompressParam *comp_param; 501 static QemuThread *compress_threads; 502 /* comp_done_cond is used to wake up the migration thread when 503 * one of the compression threads has finished the compression. 504 * comp_done_lock is used to co-work with comp_done_cond. 
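 *
 * The migration thread side of that handshake looks roughly like the
 * pattern below (simplified from flush_compressed_data() and
 * compress_page_with_multi_thread() later in this file); each worker sets
 * ->done and signals comp_done_cond under the same lock in
 * do_data_compress():
 *
 *   qemu_mutex_lock(&comp_done_lock);
 *   while (!comp_param[idx].done) {
 *       qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 *   }
 *   qemu_mutex_unlock(&comp_done_lock);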
505 */ 506 static QemuMutex comp_done_lock; 507 static QemuCond comp_done_cond; 508 509 static QEMUFile *decomp_file; 510 static DecompressParam *decomp_param; 511 static QemuThread *decompress_threads; 512 static QemuMutex decomp_done_lock; 513 static QemuCond decomp_done_cond; 514 515 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 516 ram_addr_t offset, uint8_t *source_buf); 517 518 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, 519 bool postcopy_requested); 520 521 static void *do_data_compress(void *opaque) 522 { 523 CompressParam *param = opaque; 524 RAMBlock *block; 525 ram_addr_t offset; 526 bool zero_page; 527 528 qemu_mutex_lock(¶m->mutex); 529 while (!param->quit) { 530 if (param->block) { 531 block = param->block; 532 offset = param->offset; 533 param->block = NULL; 534 qemu_mutex_unlock(¶m->mutex); 535 536 zero_page = do_compress_ram_page(param->file, ¶m->stream, 537 block, offset, param->originbuf); 538 539 qemu_mutex_lock(&comp_done_lock); 540 param->done = true; 541 param->zero_page = zero_page; 542 qemu_cond_signal(&comp_done_cond); 543 qemu_mutex_unlock(&comp_done_lock); 544 545 qemu_mutex_lock(¶m->mutex); 546 } else { 547 qemu_cond_wait(¶m->cond, ¶m->mutex); 548 } 549 } 550 qemu_mutex_unlock(¶m->mutex); 551 552 return NULL; 553 } 554 555 static void compress_threads_save_cleanup(void) 556 { 557 int i, thread_count; 558 559 if (!migrate_use_compression() || !comp_param) { 560 return; 561 } 562 563 thread_count = migrate_compress_threads(); 564 for (i = 0; i < thread_count; i++) { 565 /* 566 * we use it as a indicator which shows if the thread is 567 * properly init'd or not 568 */ 569 if (!comp_param[i].file) { 570 break; 571 } 572 573 qemu_mutex_lock(&comp_param[i].mutex); 574 comp_param[i].quit = true; 575 qemu_cond_signal(&comp_param[i].cond); 576 qemu_mutex_unlock(&comp_param[i].mutex); 577 578 qemu_thread_join(compress_threads + i); 579 qemu_mutex_destroy(&comp_param[i].mutex); 580 qemu_cond_destroy(&comp_param[i].cond); 581 deflateEnd(&comp_param[i].stream); 582 g_free(comp_param[i].originbuf); 583 qemu_fclose(comp_param[i].file); 584 comp_param[i].file = NULL; 585 } 586 qemu_mutex_destroy(&comp_done_lock); 587 qemu_cond_destroy(&comp_done_cond); 588 g_free(compress_threads); 589 g_free(comp_param); 590 compress_threads = NULL; 591 comp_param = NULL; 592 } 593 594 static int compress_threads_save_setup(void) 595 { 596 int i, thread_count; 597 598 if (!migrate_use_compression()) { 599 return 0; 600 } 601 thread_count = migrate_compress_threads(); 602 compress_threads = g_new0(QemuThread, thread_count); 603 comp_param = g_new0(CompressParam, thread_count); 604 qemu_cond_init(&comp_done_cond); 605 qemu_mutex_init(&comp_done_lock); 606 for (i = 0; i < thread_count; i++) { 607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 608 if (!comp_param[i].originbuf) { 609 goto exit; 610 } 611 612 if (deflateInit(&comp_param[i].stream, 613 migrate_compress_level()) != Z_OK) { 614 g_free(comp_param[i].originbuf); 615 goto exit; 616 } 617 618 /* comp_param[i].file is just used as a dummy buffer to save data, 619 * set its ops to empty. 
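 *
 * The compressed bytes accumulate in this in-memory QEMUFile and are later
 * drained into the real migration stream by the migration thread, e.g. in
 * flush_compressed_data() further down:
 *
 *   len = qemu_put_qemu_file(rs->f, comp_param[idx].file);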
620 */ 621 comp_param[i].file = qemu_file_new_output( 622 QIO_CHANNEL(qio_channel_null_new())); 623 comp_param[i].done = true; 624 comp_param[i].quit = false; 625 qemu_mutex_init(&comp_param[i].mutex); 626 qemu_cond_init(&comp_param[i].cond); 627 qemu_thread_create(compress_threads + i, "compress", 628 do_data_compress, comp_param + i, 629 QEMU_THREAD_JOINABLE); 630 } 631 return 0; 632 633 exit: 634 compress_threads_save_cleanup(); 635 return -1; 636 } 637 638 /** 639 * save_page_header: write page header to wire 640 * 641 * If this is the 1st block, it also writes the block identification 642 * 643 * Returns the number of bytes written 644 * 645 * @f: QEMUFile where to send the data 646 * @block: block that contains the page we want to send 647 * @offset: offset inside the block for the page 648 * in the lower bits, it contains flags 649 */ 650 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 651 ram_addr_t offset) 652 { 653 size_t size, len; 654 655 if (block == rs->last_sent_block) { 656 offset |= RAM_SAVE_FLAG_CONTINUE; 657 } 658 qemu_put_be64(f, offset); 659 size = 8; 660 661 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 662 len = strlen(block->idstr); 663 qemu_put_byte(f, len); 664 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 665 size += 1 + len; 666 rs->last_sent_block = block; 667 } 668 return size; 669 } 670 671 /** 672 * mig_throttle_guest_down: throttle down the guest 673 * 674 * Reduce amount of guest cpu execution to hopefully slow down memory 675 * writes. If guest dirty memory rate is reduced below the rate at 676 * which we can transfer pages to the destination then we should be 677 * able to complete migration. Some workloads dirty memory way too 678 * fast and will not effectively converge, even with auto-converge. 679 */ 680 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 681 uint64_t bytes_dirty_threshold) 682 { 683 MigrationState *s = migrate_get_current(); 684 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 685 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 686 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 687 int pct_max = s->parameters.max_cpu_throttle; 688 689 uint64_t throttle_now = cpu_throttle_get_percentage(); 690 uint64_t cpu_now, cpu_ideal, throttle_inc; 691 692 /* We have not started throttling yet. Let's start it. */ 693 if (!cpu_throttle_active()) { 694 cpu_throttle_set(pct_initial); 695 } else { 696 /* Throttling already on, just increase the rate */ 697 if (!pct_tailslow) { 698 throttle_inc = pct_increment; 699 } else { 700 /* Compute the ideal CPU percentage used by Guest, which may 701 * make the dirty rate match the dirty rate threshold. */ 702 cpu_now = 100 - throttle_now; 703 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 704 bytes_dirty_period); 705 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 706 } 707 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 708 } 709 } 710 711 void mig_throttle_counter_reset(void) 712 { 713 RAMState *rs = ram_state; 714 715 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 716 rs->num_dirty_pages_period = 0; 717 rs->bytes_xfer_prev = ram_counters.transferred; 718 } 719 720 /** 721 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 722 * 723 * @rs: current RAM state 724 * @current_addr: address for the zero page 725 * 726 * Update the xbzrle cache to reflect a page that's been sent as all 0. 
727 * The important thing is that a stale (not-yet-0'd) page be replaced 728 * by the new data. 729 * As a bonus, if the page wasn't in the cache it gets added so that 730 * when a small write is made into the 0'd page it gets XBZRLE sent. 731 */ 732 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 733 { 734 if (!rs->xbzrle_enabled) { 735 return; 736 } 737 738 /* We don't care if this fails to allocate a new cache page 739 * as long as it updated an old one */ 740 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 741 ram_counters.dirty_sync_count); 742 } 743 744 #define ENCODING_FLAG_XBZRLE 0x1 745 746 /** 747 * save_xbzrle_page: compress and send current page 748 * 749 * Returns: 1 means that we wrote the page 750 * 0 means that page is identical to the one already sent 751 * -1 means that xbzrle would be longer than normal 752 * 753 * @rs: current RAM state 754 * @current_data: pointer to the address of the page contents 755 * @current_addr: addr of the page 756 * @block: block that contains the page we want to send 757 * @offset: offset inside the block for the page 758 */ 759 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 760 ram_addr_t current_addr, RAMBlock *block, 761 ram_addr_t offset) 762 { 763 int encoded_len = 0, bytes_xbzrle; 764 uint8_t *prev_cached_page; 765 766 if (!cache_is_cached(XBZRLE.cache, current_addr, 767 ram_counters.dirty_sync_count)) { 768 xbzrle_counters.cache_miss++; 769 if (!rs->last_stage) { 770 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 771 ram_counters.dirty_sync_count) == -1) { 772 return -1; 773 } else { 774 /* update *current_data when the page has been 775 inserted into cache */ 776 *current_data = get_cached_data(XBZRLE.cache, current_addr); 777 } 778 } 779 return -1; 780 } 781 782 /* 783 * Reaching here means the page has hit the xbzrle cache, no matter what 784 * encoding result it is (normal encoding, overflow or skipping the page), 785 * count the page as encoded. This is used to calculate the encoding rate. 786 * 787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 788 * 2nd page turns out to be skipped (i.e. no new bytes written to the 789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 790 * skipped page included. In this way, the encoding rate can tell if the 791 * guest page is good for xbzrle encoding. 792 */ 793 xbzrle_counters.pages++; 794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 795 796 /* save current buffer into memory */ 797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 798 799 /* XBZRLE encoding (if there is no overflow) */ 800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 802 TARGET_PAGE_SIZE); 803 804 /* 805 * Update the cache contents, so that it corresponds to the data 806 * sent, in all cases except where we skip the page. 807 */ 808 if (!rs->last_stage && encoded_len != 0) { 809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 810 /* 811 * In the case where we couldn't compress, ensure that the caller 812 * sends the data from the cache, since the guest might have 813 * changed the RAM since we copied it. 
814 */ 815 *current_data = prev_cached_page; 816 } 817 818 if (encoded_len == 0) { 819 trace_save_xbzrle_page_skipping(); 820 return 0; 821 } else if (encoded_len == -1) { 822 trace_save_xbzrle_page_overflow(); 823 xbzrle_counters.overflow++; 824 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 825 return -1; 826 } 827 828 /* Send XBZRLE based compressed page */ 829 bytes_xbzrle = save_page_header(rs, rs->f, block, 830 offset | RAM_SAVE_FLAG_XBZRLE); 831 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 832 qemu_put_be16(rs->f, encoded_len); 833 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 834 bytes_xbzrle += encoded_len + 1 + 2; 835 /* 836 * Like compressed_size (please see update_compress_thread_counts), 837 * the xbzrle encoded bytes don't count the 8 byte header with 838 * RAM_SAVE_FLAG_CONTINUE. 839 */ 840 xbzrle_counters.bytes += bytes_xbzrle - 8; 841 ram_transferred_add(bytes_xbzrle); 842 843 return 1; 844 } 845 846 /** 847 * migration_bitmap_find_dirty: find the next dirty page from start 848 * 849 * Returns the page offset within memory region of the start of a dirty page 850 * 851 * @rs: current RAM state 852 * @rb: RAMBlock where to search for dirty pages 853 * @start: page where we start the search 854 */ 855 static inline 856 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 857 unsigned long start) 858 { 859 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 860 unsigned long *bitmap = rb->bmap; 861 862 if (ramblock_is_ignored(rb)) { 863 return size; 864 } 865 866 return find_next_bit(bitmap, size, start); 867 } 868 869 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 870 unsigned long page) 871 { 872 uint8_t shift; 873 hwaddr size, start; 874 875 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 876 return; 877 } 878 879 shift = rb->clear_bmap_shift; 880 /* 881 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 882 * can make things easier sometimes since then start address 883 * of the small chunk will always be 64 pages aligned so the 884 * bitmap will always be aligned to unsigned long. We should 885 * even be able to remove this restriction but I'm simply 886 * keeping it. 887 */ 888 assert(shift >= 6); 889 890 size = 1ULL << (TARGET_PAGE_BITS + shift); 891 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 892 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 893 memory_region_clear_dirty_bitmap(rb->mr, start, size); 894 } 895 896 static void 897 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 898 unsigned long start, 899 unsigned long npages) 900 { 901 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 902 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 903 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 904 905 /* 906 * Clear pages from start to start + npages - 1, so the end boundary is 907 * exclusive. 
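 *
 * Worked example with illustrative numbers: if rb->clear_bmap_shift is 18,
 * one clear_bmap chunk covers 2^18 pages, so chunk_pages == 0x40000. For
 * start == 0x50000 and npages == 0x80000 this gives chunk_start == 0x40000
 * and chunk_end == 0x100000, i.e. migration_clear_memory_region_dirty_bitmap()
 * is invoked for the three chunks beginning at pages 0x40000, 0x80000 and
 * 0xc0000.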
908 */ 909 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 910 migration_clear_memory_region_dirty_bitmap(rb, i); 911 } 912 } 913 914 /* 915 * colo_bitmap_find_diry:find contiguous dirty pages from start 916 * 917 * Returns the page offset within memory region of the start of the contiguout 918 * dirty page 919 * 920 * @rs: current RAM state 921 * @rb: RAMBlock where to search for dirty pages 922 * @start: page where we start the search 923 * @num: the number of contiguous dirty pages 924 */ 925 static inline 926 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 927 unsigned long start, unsigned long *num) 928 { 929 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 930 unsigned long *bitmap = rb->bmap; 931 unsigned long first, next; 932 933 *num = 0; 934 935 if (ramblock_is_ignored(rb)) { 936 return size; 937 } 938 939 first = find_next_bit(bitmap, size, start); 940 if (first >= size) { 941 return first; 942 } 943 next = find_next_zero_bit(bitmap, size, first + 1); 944 assert(next >= first); 945 *num = next - first; 946 return first; 947 } 948 949 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 950 RAMBlock *rb, 951 unsigned long page) 952 { 953 bool ret; 954 955 /* 956 * Clear dirty bitmap if needed. This _must_ be called before we 957 * send any of the page in the chunk because we need to make sure 958 * we can capture further page content changes when we sync dirty 959 * log the next time. So as long as we are going to send any of 960 * the page in the chunk we clear the remote dirty bitmap for all. 961 * Clearing it earlier won't be a problem, but too late will. 962 */ 963 migration_clear_memory_region_dirty_bitmap(rb, page); 964 965 ret = test_and_clear_bit(page, rb->bmap); 966 if (ret) { 967 rs->migration_dirty_pages--; 968 } 969 970 return ret; 971 } 972 973 static void dirty_bitmap_clear_section(MemoryRegionSection *section, 974 void *opaque) 975 { 976 const hwaddr offset = section->offset_within_region; 977 const hwaddr size = int128_get64(section->size); 978 const unsigned long start = offset >> TARGET_PAGE_BITS; 979 const unsigned long npages = size >> TARGET_PAGE_BITS; 980 RAMBlock *rb = section->mr->ram_block; 981 uint64_t *cleared_bits = opaque; 982 983 /* 984 * We don't grab ram_state->bitmap_mutex because we expect to run 985 * only when starting migration or during postcopy recovery where 986 * we don't have concurrent access. 987 */ 988 if (!migration_in_postcopy() && !migrate_background_snapshot()) { 989 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages); 990 } 991 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); 992 bitmap_clear(rb->bmap, start, npages); 993 } 994 995 /* 996 * Exclude all dirty pages from migration that fall into a discarded range as 997 * managed by a RamDiscardManager responsible for the mapped memory region of 998 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps. 999 * 1000 * Discarded pages ("logically unplugged") have undefined content and must 1001 * not get migrated, because even reading these pages for migration might 1002 * result in undesired behavior. 1003 * 1004 * Returns the number of cleared bits in the RAMBlock dirty bitmap. 1005 * 1006 * Note: The result is only stable while migrating (precopy/postcopy). 
1007 */ 1008 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 1009 { 1010 uint64_t cleared_bits = 0; 1011 1012 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 1013 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1014 MemoryRegionSection section = { 1015 .mr = rb->mr, 1016 .offset_within_region = 0, 1017 .size = int128_make64(qemu_ram_get_used_length(rb)), 1018 }; 1019 1020 ram_discard_manager_replay_discarded(rdm, §ion, 1021 dirty_bitmap_clear_section, 1022 &cleared_bits); 1023 } 1024 return cleared_bits; 1025 } 1026 1027 /* 1028 * Check if a host-page aligned page falls into a discarded range as managed by 1029 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 1030 * 1031 * Note: The result is only stable while migrating (precopy/postcopy). 1032 */ 1033 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 1034 { 1035 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1036 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1037 MemoryRegionSection section = { 1038 .mr = rb->mr, 1039 .offset_within_region = start, 1040 .size = int128_make64(qemu_ram_pagesize(rb)), 1041 }; 1042 1043 return !ram_discard_manager_is_populated(rdm, §ion); 1044 } 1045 return false; 1046 } 1047 1048 /* Called with RCU critical section */ 1049 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 1050 { 1051 uint64_t new_dirty_pages = 1052 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 1053 1054 rs->migration_dirty_pages += new_dirty_pages; 1055 rs->num_dirty_pages_period += new_dirty_pages; 1056 } 1057 1058 /** 1059 * ram_pagesize_summary: calculate all the pagesizes of a VM 1060 * 1061 * Returns a summary bitmap of the page sizes of all RAMBlocks 1062 * 1063 * For VMs with just normal pages this is equivalent to the host page 1064 * size. If it's got some huge pages then it's the OR of all the 1065 * different page sizes. 
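 *
 * Worked example: a guest backed by normal 4 KiB pages plus one 2 MiB
 * hugetlbfs RAMBlock yields 0x1000 | 0x200000 == 0x201000; because page
 * sizes are powers of two, callers can both detect that huge pages are in
 * use and recover each individual size from the summary.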
1066 */ 1067 uint64_t ram_pagesize_summary(void) 1068 { 1069 RAMBlock *block; 1070 uint64_t summary = 0; 1071 1072 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1073 summary |= block->page_size; 1074 } 1075 1076 return summary; 1077 } 1078 1079 uint64_t ram_get_total_transferred_pages(void) 1080 { 1081 return ram_counters.normal + ram_counters.duplicate + 1082 compression_counters.pages + xbzrle_counters.pages; 1083 } 1084 1085 static void migration_update_rates(RAMState *rs, int64_t end_time) 1086 { 1087 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1088 double compressed_size; 1089 1090 /* calculate period counters */ 1091 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1092 / (end_time - rs->time_last_bitmap_sync); 1093 1094 if (!page_count) { 1095 return; 1096 } 1097 1098 if (migrate_use_xbzrle()) { 1099 double encoded_size, unencoded_size; 1100 1101 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1102 rs->xbzrle_cache_miss_prev) / page_count; 1103 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1104 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1105 TARGET_PAGE_SIZE; 1106 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1107 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1108 xbzrle_counters.encoding_rate = 0; 1109 } else { 1110 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1111 } 1112 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1113 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1114 } 1115 1116 if (migrate_use_compression()) { 1117 compression_counters.busy_rate = (double)(compression_counters.busy - 1118 rs->compress_thread_busy_prev) / page_count; 1119 rs->compress_thread_busy_prev = compression_counters.busy; 1120 1121 compressed_size = compression_counters.compressed_size - 1122 rs->compressed_size_prev; 1123 if (compressed_size) { 1124 double uncompressed_size = (compression_counters.pages - 1125 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1126 1127 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1128 compression_counters.compression_rate = 1129 uncompressed_size / compressed_size; 1130 1131 rs->compress_pages_prev = compression_counters.pages; 1132 rs->compressed_size_prev = compression_counters.compressed_size; 1133 } 1134 } 1135 } 1136 1137 static void migration_trigger_throttle(RAMState *rs) 1138 { 1139 MigrationState *s = migrate_get_current(); 1140 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1141 1142 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 1143 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1144 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1145 1146 /* During block migration the auto-converge logic incorrectly detects 1147 * that ram migration makes no progress. Avoid this by disabling the 1148 * throttling logic during the bulk phase of block migration. */ 1149 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1150 /* The following detection logic can be refined later. For now: 1151 Check to see if the ratio between dirtied bytes and the approx. 1152 amount of bytes that just got transferred since the last time 1153 we were in this routine reaches the threshold. If that happens 1154 twice, start or increase throttling. 
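       Worked example with illustrative numbers: with a
       throttle-trigger-threshold of 50 (percent), transferring 1 GiB
       during the last period makes bytes_dirty_threshold 512 MiB; if the
       guest dirtied 600 MiB of RAM in that same period the condition
       below is met, and hitting it twice calls mig_throttle_guest_down().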
*/ 1155 1156 if ((bytes_dirty_period > bytes_dirty_threshold) && 1157 (++rs->dirty_rate_high_cnt >= 2)) { 1158 trace_migration_throttle(); 1159 rs->dirty_rate_high_cnt = 0; 1160 mig_throttle_guest_down(bytes_dirty_period, 1161 bytes_dirty_threshold); 1162 } 1163 } 1164 } 1165 1166 static void migration_bitmap_sync(RAMState *rs) 1167 { 1168 RAMBlock *block; 1169 int64_t end_time; 1170 1171 ram_counters.dirty_sync_count++; 1172 1173 if (!rs->time_last_bitmap_sync) { 1174 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1175 } 1176 1177 trace_migration_bitmap_sync_start(); 1178 memory_global_dirty_log_sync(); 1179 1180 qemu_mutex_lock(&rs->bitmap_mutex); 1181 WITH_RCU_READ_LOCK_GUARD() { 1182 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1183 ramblock_sync_dirty_bitmap(rs, block); 1184 } 1185 ram_counters.remaining = ram_bytes_remaining(); 1186 } 1187 qemu_mutex_unlock(&rs->bitmap_mutex); 1188 1189 memory_global_after_dirty_log_sync(); 1190 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1191 1192 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1193 1194 /* more than 1 second = 1000 millisecons */ 1195 if (end_time > rs->time_last_bitmap_sync + 1000) { 1196 migration_trigger_throttle(rs); 1197 1198 migration_update_rates(rs, end_time); 1199 1200 rs->target_page_count_prev = rs->target_page_count; 1201 1202 /* reset period counters */ 1203 rs->time_last_bitmap_sync = end_time; 1204 rs->num_dirty_pages_period = 0; 1205 rs->bytes_xfer_prev = ram_counters.transferred; 1206 } 1207 if (migrate_use_events()) { 1208 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1209 } 1210 } 1211 1212 static void migration_bitmap_sync_precopy(RAMState *rs) 1213 { 1214 Error *local_err = NULL; 1215 1216 /* 1217 * The current notifier usage is just an optimization to migration, so we 1218 * don't stop the normal migration process in the error case. 1219 */ 1220 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1221 error_report_err(local_err); 1222 local_err = NULL; 1223 } 1224 1225 migration_bitmap_sync(rs); 1226 1227 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1228 error_report_err(local_err); 1229 } 1230 } 1231 1232 static void ram_release_page(const char *rbname, uint64_t offset) 1233 { 1234 if (!migrate_release_ram() || !migration_in_postcopy()) { 1235 return; 1236 } 1237 1238 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1239 } 1240 1241 /** 1242 * save_zero_page_to_file: send the zero page to the file 1243 * 1244 * Returns the size of data written to the file, 0 means the page is not 1245 * a zero page 1246 * 1247 * @rs: current RAM state 1248 * @file: the file where the data is saved 1249 * @block: block that contains the page we want to send 1250 * @offset: offset inside the block for the page 1251 */ 1252 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1253 RAMBlock *block, ram_addr_t offset) 1254 { 1255 uint8_t *p = block->host + offset; 1256 int len = 0; 1257 1258 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1259 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1260 qemu_put_byte(file, 0); 1261 len += 1; 1262 ram_release_page(block->idstr, offset); 1263 } 1264 return len; 1265 } 1266 1267 /** 1268 * save_zero_page: send the zero page to the stream 1269 * 1270 * Returns the number of pages written. 
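 *
 * On the wire this is just the page header written by
 * save_zero_page_to_file() with RAM_SAVE_FLAG_ZERO set, followed by one
 * zero byte; e.g. for a page in the block we already announced:
 *
 *   qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE);
 *   qemu_put_byte(f, 0);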
1271 * 1272 * @rs: current RAM state 1273 * @block: block that contains the page we want to send 1274 * @offset: offset inside the block for the page 1275 */ 1276 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1277 { 1278 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1279 1280 if (len) { 1281 ram_counters.duplicate++; 1282 ram_transferred_add(len); 1283 return 1; 1284 } 1285 return -1; 1286 } 1287 1288 /* 1289 * @pages: the number of pages written by the control path, 1290 * < 0 - error 1291 * > 0 - number of pages written 1292 * 1293 * Return true if the pages has been saved, otherwise false is returned. 1294 */ 1295 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1296 int *pages) 1297 { 1298 uint64_t bytes_xmit = 0; 1299 int ret; 1300 1301 *pages = -1; 1302 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1303 &bytes_xmit); 1304 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1305 return false; 1306 } 1307 1308 if (bytes_xmit) { 1309 ram_transferred_add(bytes_xmit); 1310 *pages = 1; 1311 } 1312 1313 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1314 return true; 1315 } 1316 1317 if (bytes_xmit > 0) { 1318 ram_counters.normal++; 1319 } else if (bytes_xmit == 0) { 1320 ram_counters.duplicate++; 1321 } 1322 1323 return true; 1324 } 1325 1326 /* 1327 * directly send the page to the stream 1328 * 1329 * Returns the number of pages written. 1330 * 1331 * @rs: current RAM state 1332 * @block: block that contains the page we want to send 1333 * @offset: offset inside the block for the page 1334 * @buf: the page to be sent 1335 * @async: send to page asyncly 1336 */ 1337 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1338 uint8_t *buf, bool async) 1339 { 1340 ram_transferred_add(save_page_header(rs, rs->f, block, 1341 offset | RAM_SAVE_FLAG_PAGE)); 1342 if (async) { 1343 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1344 migrate_release_ram() && 1345 migration_in_postcopy()); 1346 } else { 1347 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1348 } 1349 ram_transferred_add(TARGET_PAGE_SIZE); 1350 ram_counters.normal++; 1351 return 1; 1352 } 1353 1354 /** 1355 * ram_save_page: send the given page to the stream 1356 * 1357 * Returns the number of pages written. 1358 * < 0 - error 1359 * >=0 - Number of pages written - this might legally be 0 1360 * if xbzrle noticed the page was the same. 
1361 * 1362 * @rs: current RAM state 1363 * @block: block that contains the page we want to send 1364 * @offset: offset inside the block for the page 1365 */ 1366 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1367 { 1368 int pages = -1; 1369 uint8_t *p; 1370 bool send_async = true; 1371 RAMBlock *block = pss->block; 1372 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1373 ram_addr_t current_addr = block->offset + offset; 1374 1375 p = block->host + offset; 1376 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1377 1378 XBZRLE_cache_lock(); 1379 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1380 pages = save_xbzrle_page(rs, &p, current_addr, block, 1381 offset); 1382 if (!rs->last_stage) { 1383 /* Can't send this cached data async, since the cache page 1384 * might get updated before it gets to the wire 1385 */ 1386 send_async = false; 1387 } 1388 } 1389 1390 /* XBZRLE overflow or normal page */ 1391 if (pages == -1) { 1392 pages = save_normal_page(rs, block, offset, p, send_async); 1393 } 1394 1395 XBZRLE_cache_unlock(); 1396 1397 return pages; 1398 } 1399 1400 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1401 ram_addr_t offset) 1402 { 1403 if (multifd_queue_page(rs->f, block, offset) < 0) { 1404 return -1; 1405 } 1406 ram_counters.normal++; 1407 1408 return 1; 1409 } 1410 1411 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1412 ram_addr_t offset, uint8_t *source_buf) 1413 { 1414 RAMState *rs = ram_state; 1415 uint8_t *p = block->host + offset; 1416 int ret; 1417 1418 if (save_zero_page_to_file(rs, f, block, offset)) { 1419 return true; 1420 } 1421 1422 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1423 1424 /* 1425 * copy it to a internal buffer to avoid it being modified by VM 1426 * so that we can catch up the error during compression and 1427 * decompression 1428 */ 1429 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1430 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1431 if (ret < 0) { 1432 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1433 error_report("compressed data failed!"); 1434 } 1435 return false; 1436 } 1437 1438 static void 1439 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1440 { 1441 ram_transferred_add(bytes_xmit); 1442 1443 if (param->zero_page) { 1444 ram_counters.duplicate++; 1445 return; 1446 } 1447 1448 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1449 compression_counters.compressed_size += bytes_xmit - 8; 1450 compression_counters.pages++; 1451 } 1452 1453 static bool save_page_use_compression(RAMState *rs); 1454 1455 static void flush_compressed_data(RAMState *rs) 1456 { 1457 int idx, len, thread_count; 1458 1459 if (!save_page_use_compression(rs)) { 1460 return; 1461 } 1462 thread_count = migrate_compress_threads(); 1463 1464 qemu_mutex_lock(&comp_done_lock); 1465 for (idx = 0; idx < thread_count; idx++) { 1466 while (!comp_param[idx].done) { 1467 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1468 } 1469 } 1470 qemu_mutex_unlock(&comp_done_lock); 1471 1472 for (idx = 0; idx < thread_count; idx++) { 1473 qemu_mutex_lock(&comp_param[idx].mutex); 1474 if (!comp_param[idx].quit) { 1475 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1476 /* 1477 * it's safe to fetch zero_page without holding comp_done_lock 1478 * as there is no further request submitted to the thread, 1479 * i.e, the thread should be waiting for a request at this point. 
1480 */ 1481 update_compress_thread_counts(&comp_param[idx], len); 1482 } 1483 qemu_mutex_unlock(&comp_param[idx].mutex); 1484 } 1485 } 1486 1487 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1488 ram_addr_t offset) 1489 { 1490 param->block = block; 1491 param->offset = offset; 1492 } 1493 1494 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1495 ram_addr_t offset) 1496 { 1497 int idx, thread_count, bytes_xmit = -1, pages = -1; 1498 bool wait = migrate_compress_wait_thread(); 1499 1500 thread_count = migrate_compress_threads(); 1501 qemu_mutex_lock(&comp_done_lock); 1502 retry: 1503 for (idx = 0; idx < thread_count; idx++) { 1504 if (comp_param[idx].done) { 1505 comp_param[idx].done = false; 1506 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1507 qemu_mutex_lock(&comp_param[idx].mutex); 1508 set_compress_params(&comp_param[idx], block, offset); 1509 qemu_cond_signal(&comp_param[idx].cond); 1510 qemu_mutex_unlock(&comp_param[idx].mutex); 1511 pages = 1; 1512 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1513 break; 1514 } 1515 } 1516 1517 /* 1518 * wait for the free thread if the user specifies 'compress-wait-thread', 1519 * otherwise we will post the page out in the main thread as normal page. 1520 */ 1521 if (pages < 0 && wait) { 1522 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1523 goto retry; 1524 } 1525 qemu_mutex_unlock(&comp_done_lock); 1526 1527 return pages; 1528 } 1529 1530 /** 1531 * find_dirty_block: find the next dirty page and update any state 1532 * associated with the search process. 1533 * 1534 * Returns true if a page is found 1535 * 1536 * @rs: current RAM state 1537 * @pss: data about the state of the current dirty page scan 1538 * @again: set to false if the search has scanned the whole of RAM 1539 */ 1540 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1541 { 1542 /* 1543 * This is not a postcopy requested page, mark it "not urgent", and use 1544 * precopy channel to send it. 1545 */ 1546 pss->postcopy_requested = false; 1547 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; 1548 1549 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1550 if (pss->complete_round && pss->block == rs->last_seen_block && 1551 pss->page >= rs->last_page) { 1552 /* 1553 * We've been once around the RAM and haven't found anything. 1554 * Give up. 1555 */ 1556 *again = false; 1557 return false; 1558 } 1559 if (!offset_in_ramblock(pss->block, 1560 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1561 /* Didn't find anything in this RAM Block */ 1562 pss->page = 0; 1563 pss->block = QLIST_NEXT_RCU(pss->block, next); 1564 if (!pss->block) { 1565 /* 1566 * If memory migration starts over, we will meet a dirtied page 1567 * which may still exists in compression threads's ring, so we 1568 * should flush the compressed data to make sure the new page 1569 * is not overwritten by the old one in the destination. 1570 * 1571 * Also If xbzrle is on, stop using the data compression at this 1572 * point. In theory, xbzrle can do better than compression. 1573 */ 1574 flush_compressed_data(rs); 1575 1576 /* Hit the end of the list */ 1577 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1578 /* Flag that we've looped */ 1579 pss->complete_round = true; 1580 /* After the first round, enable XBZRLE. 
*/ 1581 if (migrate_use_xbzrle()) { 1582 rs->xbzrle_enabled = true; 1583 } 1584 } 1585 /* Didn't find anything this time, but try again on the new block */ 1586 *again = true; 1587 return false; 1588 } else { 1589 /* Can go around again, but... */ 1590 *again = true; 1591 /* We've found something so probably don't need to */ 1592 return true; 1593 } 1594 } 1595 1596 /** 1597 * unqueue_page: gets a page of the queue 1598 * 1599 * Helper for 'get_queued_page' - gets a page off the queue 1600 * 1601 * Returns the block of the page (or NULL if none available) 1602 * 1603 * @rs: current RAM state 1604 * @offset: used to return the offset within the RAMBlock 1605 */ 1606 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1607 { 1608 struct RAMSrcPageRequest *entry; 1609 RAMBlock *block = NULL; 1610 size_t page_size; 1611 1612 if (!postcopy_has_request(rs)) { 1613 return NULL; 1614 } 1615 1616 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1617 1618 /* 1619 * This should _never_ change even after we take the lock, because no one 1620 * should be taking anything off the request list other than us. 1621 */ 1622 assert(postcopy_has_request(rs)); 1623 1624 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1625 block = entry->rb; 1626 *offset = entry->offset; 1627 page_size = qemu_ram_pagesize(block); 1628 /* Each page request should only be multiple page size of the ramblock */ 1629 assert((entry->len % page_size) == 0); 1630 1631 if (entry->len > page_size) { 1632 entry->len -= page_size; 1633 entry->offset += page_size; 1634 } else { 1635 memory_region_unref(block->mr); 1636 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1637 g_free(entry); 1638 migration_consume_urgent_request(); 1639 } 1640 1641 trace_unqueue_page(block->idstr, *offset, 1642 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap)); 1643 1644 return block; 1645 } 1646 1647 #if defined(__linux__) 1648 /** 1649 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1650 * is found, return RAM block pointer and page offset 1651 * 1652 * Returns pointer to the RAMBlock containing faulting page, 1653 * NULL if no write faults are pending 1654 * 1655 * @rs: current RAM state 1656 * @offset: page offset from the beginning of the block 1657 */ 1658 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1659 { 1660 struct uffd_msg uffd_msg; 1661 void *page_address; 1662 RAMBlock *block; 1663 int res; 1664 1665 if (!migrate_background_snapshot()) { 1666 return NULL; 1667 } 1668 1669 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1670 if (res <= 0) { 1671 return NULL; 1672 } 1673 1674 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1675 block = qemu_ram_block_from_host(page_address, false, offset); 1676 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1677 return block; 1678 } 1679 1680 /** 1681 * ram_save_release_protection: release UFFD write protection after 1682 * a range of pages has been saved 1683 * 1684 * @rs: current RAM state 1685 * @pss: page-search-status structure 1686 * @start_page: index of the first page in the range relative to pss->block 1687 * 1688 * Returns 0 on success, negative value in case of an error 1689 */ 1690 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1691 unsigned long start_page) 1692 { 1693 int res = 0; 1694 1695 /* Check if page is from UFFD-managed region. 
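 *
 * Illustrative example: with 4 KiB target pages, start_page == 0x100 and
 * pss->page == 0x104 make the code below flush rs->f and then remove write
 * protection from 16 KiB of guest memory starting at
 * pss->block->host + 0x100000.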
*/ 1696 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1697 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1698 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1699 1700 /* Flush async buffers before un-protect. */ 1701 qemu_fflush(rs->f); 1702 /* Un-protect memory range. */ 1703 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1704 false, false); 1705 } 1706 1707 return res; 1708 } 1709 1710 /* ram_write_tracking_available: check if kernel supports required UFFD features 1711 * 1712 * Returns true if supports, false otherwise 1713 */ 1714 bool ram_write_tracking_available(void) 1715 { 1716 uint64_t uffd_features; 1717 int res; 1718 1719 res = uffd_query_features(&uffd_features); 1720 return (res == 0 && 1721 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1722 } 1723 1724 /* ram_write_tracking_compatible: check if guest configuration is 1725 * compatible with 'write-tracking' 1726 * 1727 * Returns true if compatible, false otherwise 1728 */ 1729 bool ram_write_tracking_compatible(void) 1730 { 1731 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1732 int uffd_fd; 1733 RAMBlock *block; 1734 bool ret = false; 1735 1736 /* Open UFFD file descriptor */ 1737 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1738 if (uffd_fd < 0) { 1739 return false; 1740 } 1741 1742 RCU_READ_LOCK_GUARD(); 1743 1744 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1745 uint64_t uffd_ioctls; 1746 1747 /* Nothing to do with read-only and MMIO-writable regions */ 1748 if (block->mr->readonly || block->mr->rom_device) { 1749 continue; 1750 } 1751 /* Try to register block memory via UFFD-IO to track writes */ 1752 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1753 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1754 goto out; 1755 } 1756 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1757 goto out; 1758 } 1759 } 1760 ret = true; 1761 1762 out: 1763 uffd_close_fd(uffd_fd); 1764 return ret; 1765 } 1766 1767 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1768 ram_addr_t size) 1769 { 1770 /* 1771 * We read one byte of each page; this will preallocate page tables if 1772 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1773 * where no page was populated yet. This might require adaption when 1774 * supporting other mappings, like shmem. 1775 */ 1776 for (; offset < size; offset += block->page_size) { 1777 char tmp = *((char *)block->host + offset); 1778 1779 /* Don't optimize the read out */ 1780 asm volatile("" : "+r" (tmp)); 1781 } 1782 } 1783 1784 static inline int populate_read_section(MemoryRegionSection *section, 1785 void *opaque) 1786 { 1787 const hwaddr size = int128_get64(section->size); 1788 hwaddr offset = section->offset_within_region; 1789 RAMBlock *block = section->mr->ram_block; 1790 1791 populate_read_range(block, offset, size); 1792 return 0; 1793 } 1794 1795 /* 1796 * ram_block_populate_read: preallocate page tables and populate pages in the 1797 * RAM block by reading a byte of each page. 1798 * 1799 * Since it's solely used for userfault_fd WP feature, here we just 1800 * hardcode page size to qemu_real_host_page_size. 1801 * 1802 * @block: RAM block to populate 1803 */ 1804 static void ram_block_populate_read(RAMBlock *rb) 1805 { 1806 /* 1807 * Skip populating all pages that fall into a discarded range as managed by 1808 * a RamDiscardManager responsible for the mapped memory region of the 1809 * RAMBlock. 
Such discarded ("logically unplugged") parts of a RAMBlock 1810 * must not get populated automatically. We don't have to track 1811 * modifications via userfaultfd WP reliably, because these pages will 1812 * not be part of the migration stream either way -- see 1813 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1814 * 1815 * Note: The result is only stable while migrating (precopy/postcopy). 1816 */ 1817 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1818 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1819 MemoryRegionSection section = { 1820 .mr = rb->mr, 1821 .offset_within_region = 0, 1822 .size = rb->mr->size, 1823 }; 1824 1825 ram_discard_manager_replay_populated(rdm, §ion, 1826 populate_read_section, NULL); 1827 } else { 1828 populate_read_range(rb, 0, rb->used_length); 1829 } 1830 } 1831 1832 /* 1833 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1834 */ 1835 void ram_write_tracking_prepare(void) 1836 { 1837 RAMBlock *block; 1838 1839 RCU_READ_LOCK_GUARD(); 1840 1841 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1842 /* Nothing to do with read-only and MMIO-writable regions */ 1843 if (block->mr->readonly || block->mr->rom_device) { 1844 continue; 1845 } 1846 1847 /* 1848 * Populate pages of the RAM block before enabling userfault_fd 1849 * write protection. 1850 * 1851 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1852 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1853 * pages with pte_none() entries in page table. 1854 */ 1855 ram_block_populate_read(block); 1856 } 1857 } 1858 1859 /* 1860 * ram_write_tracking_start: start UFFD-WP memory tracking 1861 * 1862 * Returns 0 for success or negative value in case of error 1863 */ 1864 int ram_write_tracking_start(void) 1865 { 1866 int uffd_fd; 1867 RAMState *rs = ram_state; 1868 RAMBlock *block; 1869 1870 /* Open UFFD file descriptor */ 1871 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1872 if (uffd_fd < 0) { 1873 return uffd_fd; 1874 } 1875 rs->uffdio_fd = uffd_fd; 1876 1877 RCU_READ_LOCK_GUARD(); 1878 1879 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1880 /* Nothing to do with read-only and MMIO-writable regions */ 1881 if (block->mr->readonly || block->mr->rom_device) { 1882 continue; 1883 } 1884 1885 /* Register block memory with UFFD to track writes */ 1886 if (uffd_register_memory(rs->uffdio_fd, block->host, 1887 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1888 goto fail; 1889 } 1890 /* Apply UFFD write protection to the block memory range */ 1891 if (uffd_change_protection(rs->uffdio_fd, block->host, 1892 block->max_length, true, false)) { 1893 goto fail; 1894 } 1895 block->flags |= RAM_UF_WRITEPROTECT; 1896 memory_region_ref(block->mr); 1897 1898 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1899 block->host, block->max_length); 1900 } 1901 1902 return 0; 1903 1904 fail: 1905 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1906 1907 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1908 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1909 continue; 1910 } 1911 /* 1912 * In case some memory block failed to be write-protected 1913 * remove protection and unregister all succeeded RAM blocks 1914 */ 1915 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1916 false, false); 1917 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1918 /* Cleanup flags and remove reference */ 1919 block->flags &= ~RAM_UF_WRITEPROTECT; 1920 
memory_region_unref(block->mr); 1921 } 1922 1923 uffd_close_fd(uffd_fd); 1924 rs->uffdio_fd = -1; 1925 return -1; 1926 } 1927 1928 /** 1929 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1930 */ 1931 void ram_write_tracking_stop(void) 1932 { 1933 RAMState *rs = ram_state; 1934 RAMBlock *block; 1935 1936 RCU_READ_LOCK_GUARD(); 1937 1938 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1939 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1940 continue; 1941 } 1942 /* Remove protection and unregister all affected RAM blocks */ 1943 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1944 false, false); 1945 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1946 1947 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1948 block->host, block->max_length); 1949 1950 /* Cleanup flags and remove reference */ 1951 block->flags &= ~RAM_UF_WRITEPROTECT; 1952 memory_region_unref(block->mr); 1953 } 1954 1955 /* Finally close UFFD file descriptor */ 1956 uffd_close_fd(rs->uffdio_fd); 1957 rs->uffdio_fd = -1; 1958 } 1959 1960 #else 1961 /* No target OS support, stubs just fail or ignore */ 1962 1963 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1964 { 1965 (void) rs; 1966 (void) offset; 1967 1968 return NULL; 1969 } 1970 1971 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1972 unsigned long start_page) 1973 { 1974 (void) rs; 1975 (void) pss; 1976 (void) start_page; 1977 1978 return 0; 1979 } 1980 1981 bool ram_write_tracking_available(void) 1982 { 1983 return false; 1984 } 1985 1986 bool ram_write_tracking_compatible(void) 1987 { 1988 assert(0); 1989 return false; 1990 } 1991 1992 int ram_write_tracking_start(void) 1993 { 1994 assert(0); 1995 return -1; 1996 } 1997 1998 void ram_write_tracking_stop(void) 1999 { 2000 assert(0); 2001 } 2002 #endif /* defined(__linux__) */ 2003 2004 /* 2005 * Check whether two addr/offset of the ramblock falls onto the same host huge 2006 * page. Returns true if so, false otherwise. 2007 */ 2008 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1, 2009 uint64_t addr2) 2010 { 2011 size_t page_size = qemu_ram_pagesize(rb); 2012 2013 addr1 = ROUND_DOWN(addr1, page_size); 2014 addr2 = ROUND_DOWN(addr2, page_size); 2015 2016 return addr1 == addr2; 2017 } 2018 2019 /* 2020 * Whether a previous preempted precopy huge page contains current requested 2021 * page? Returns true if so, false otherwise. 2022 * 2023 * This should really happen very rarely, because it means when we were sending 2024 * during background migration for postcopy we're sending exactly the page that 2025 * some vcpu got faulted on on dest node. When it happens, we probably don't 2026 * need to do much but drop the request, because we know right after we restore 2027 * the precopy stream it'll be serviced. It'll slightly affect the order of 2028 * postcopy requests to be serviced (e.g. it'll be the same as we move current 2029 * request to the end of the queue) but it shouldn't be a big deal. The most 2030 * imporant thing is we can _never_ try to send a partial-sent huge page on the 2031 * POSTCOPY channel again, otherwise that huge page will got "split brain" on 2032 * two channels (PRECOPY, POSTCOPY). 2033 */ 2034 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block, 2035 ram_addr_t offset) 2036 { 2037 PostcopyPreemptState *state = &rs->postcopy_preempt_state; 2038 2039 /* No preemption at all? 
*/ 2040 if (!state->preempted) { 2041 return false; 2042 } 2043 2044 /* Not even the same ramblock? */ 2045 if (state->ram_block != block) { 2046 return false; 2047 } 2048 2049 return offset_on_same_huge_page(block, offset, 2050 state->ram_page << TARGET_PAGE_BITS); 2051 } 2052 2053 /** 2054 * get_queued_page: unqueue a page from the postcopy requests 2055 * 2056 * Skips pages that are already sent (!dirty) 2057 * 2058 * Returns true if a queued page is found 2059 * 2060 * @rs: current RAM state 2061 * @pss: data about the state of the current dirty page scan 2062 */ 2063 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2064 { 2065 RAMBlock *block; 2066 ram_addr_t offset; 2067 2068 block = unqueue_page(rs, &offset); 2069 2070 if (block) { 2071 /* See comment above postcopy_preempted_contains() */ 2072 if (postcopy_preempted_contains(rs, block, offset)) { 2073 trace_postcopy_preempt_hit(block->idstr, offset); 2074 /* 2075 * If what we preempted previously was exactly what we're 2076 * requesting right now, restore the preempted precopy 2077 * immediately, boosting its priority as it's requested by 2078 * postcopy. 2079 */ 2080 postcopy_preempt_restore(rs, pss, true); 2081 return true; 2082 } 2083 } else { 2084 /* 2085 * Poll write faults too if background snapshot is enabled; that's 2086 * when we have vcpus got blocked by the write protected pages. 2087 */ 2088 block = poll_fault_page(rs, &offset); 2089 } 2090 2091 if (block) { 2092 /* 2093 * We want the background search to continue from the queued page 2094 * since the guest is likely to want other pages near to the page 2095 * it just requested. 2096 */ 2097 pss->block = block; 2098 pss->page = offset >> TARGET_PAGE_BITS; 2099 2100 /* 2101 * This unqueued page would break the "one round" check, even is 2102 * really rare. 2103 */ 2104 pss->complete_round = false; 2105 /* Mark it an urgent request, meanwhile using POSTCOPY channel */ 2106 pss->postcopy_requested = true; 2107 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY; 2108 } 2109 2110 return !!block; 2111 } 2112 2113 /** 2114 * migration_page_queue_free: drop any remaining pages in the ram 2115 * request queue 2116 * 2117 * It should be empty at the end anyway, but in error cases there may 2118 * be some left. in case that there is any page left, we drop it. 2119 * 2120 */ 2121 static void migration_page_queue_free(RAMState *rs) 2122 { 2123 struct RAMSrcPageRequest *mspr, *next_mspr; 2124 /* This queue generally should be empty - but in the case of a failed 2125 * migration might have some droppings in. 2126 */ 2127 RCU_READ_LOCK_GUARD(); 2128 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2129 memory_region_unref(mspr->rb->mr); 2130 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2131 g_free(mspr); 2132 } 2133 } 2134 2135 /** 2136 * ram_save_queue_pages: queue the page for transmission 2137 * 2138 * A request from postcopy destination for example. 2139 * 2140 * Returns zero on success or negative on error 2141 * 2142 * @rbname: Name of the RAMBLock of the request. NULL means the 2143 * same that last one. 
2144 * @start: starting address from the start of the RAMBlock 2145 * @len: length (in bytes) to send 2146 */ 2147 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2148 { 2149 RAMBlock *ramblock; 2150 RAMState *rs = ram_state; 2151 2152 ram_counters.postcopy_requests++; 2153 RCU_READ_LOCK_GUARD(); 2154 2155 if (!rbname) { 2156 /* Reuse last RAMBlock */ 2157 ramblock = rs->last_req_rb; 2158 2159 if (!ramblock) { 2160 /* 2161 * Shouldn't happen, we can't reuse the last RAMBlock if 2162 * it's the 1st request. 2163 */ 2164 error_report("ram_save_queue_pages no previous block"); 2165 return -1; 2166 } 2167 } else { 2168 ramblock = qemu_ram_block_by_name(rbname); 2169 2170 if (!ramblock) { 2171 /* We shouldn't be asked for a non-existent RAMBlock */ 2172 error_report("ram_save_queue_pages no block '%s'", rbname); 2173 return -1; 2174 } 2175 rs->last_req_rb = ramblock; 2176 } 2177 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2178 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2179 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2180 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2181 __func__, start, len, ramblock->used_length); 2182 return -1; 2183 } 2184 2185 struct RAMSrcPageRequest *new_entry = 2186 g_new0(struct RAMSrcPageRequest, 1); 2187 new_entry->rb = ramblock; 2188 new_entry->offset = start; 2189 new_entry->len = len; 2190 2191 memory_region_ref(ramblock->mr); 2192 qemu_mutex_lock(&rs->src_page_req_mutex); 2193 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2194 migration_make_urgent_request(); 2195 qemu_mutex_unlock(&rs->src_page_req_mutex); 2196 2197 return 0; 2198 } 2199 2200 static bool save_page_use_compression(RAMState *rs) 2201 { 2202 if (!migrate_use_compression()) { 2203 return false; 2204 } 2205 2206 /* 2207 * If xbzrle is enabled (e.g., after first round of migration), stop 2208 * using the data compression. In theory, xbzrle can do better than 2209 * compression. 2210 */ 2211 if (rs->xbzrle_enabled) { 2212 return false; 2213 } 2214 2215 return true; 2216 } 2217 2218 /* 2219 * try to compress the page before posting it out, return true if the page 2220 * has been properly handled by compression, otherwise needs other 2221 * paths to handle it 2222 */ 2223 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2224 { 2225 if (!save_page_use_compression(rs)) { 2226 return false; 2227 } 2228 2229 /* 2230 * When starting the process of a new block, the first page of 2231 * the block should be sent out before other pages in the same 2232 * block, and all the pages in last block should have been sent 2233 * out, keeping this order is important, because the 'cont' flag 2234 * is used to avoid resending the block name. 2235 * 2236 * We post the fist page as normal page as compression will take 2237 * much CPU resource. 
2238 */ 2239 if (block != rs->last_sent_block) { 2240 flush_compressed_data(rs); 2241 return false; 2242 } 2243 2244 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2245 return true; 2246 } 2247 2248 compression_counters.busy++; 2249 return false; 2250 } 2251 2252 /** 2253 * ram_save_target_page: save one target page 2254 * 2255 * Returns the number of pages written 2256 * 2257 * @rs: current RAM state 2258 * @pss: data about the page we want to send 2259 */ 2260 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2261 { 2262 RAMBlock *block = pss->block; 2263 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2264 int res; 2265 2266 if (control_save_page(rs, block, offset, &res)) { 2267 return res; 2268 } 2269 2270 if (save_compress_page(rs, block, offset)) { 2271 return 1; 2272 } 2273 2274 res = save_zero_page(rs, block, offset); 2275 if (res > 0) { 2276 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2277 * page would be stale 2278 */ 2279 if (!save_page_use_compression(rs)) { 2280 XBZRLE_cache_lock(); 2281 xbzrle_cache_zero_page(rs, block->offset + offset); 2282 XBZRLE_cache_unlock(); 2283 } 2284 return res; 2285 } 2286 2287 /* 2288 * Do not use multifd for: 2289 * 1. Compression as the first page in the new block should be posted out 2290 * before sending the compressed page 2291 * 2. In postcopy as one whole host page should be placed 2292 */ 2293 if (!save_page_use_compression(rs) && migrate_use_multifd() 2294 && !migration_in_postcopy()) { 2295 return ram_save_multifd_page(rs, block, offset); 2296 } 2297 2298 return ram_save_page(rs, pss); 2299 } 2300 2301 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss) 2302 { 2303 MigrationState *ms = migrate_get_current(); 2304 2305 /* Not enabled eager preempt? Then never do that. */ 2306 if (!migrate_postcopy_preempt()) { 2307 return false; 2308 } 2309 2310 /* If the user explicitly disabled breaking of huge page, skip */ 2311 if (!ms->postcopy_preempt_break_huge) { 2312 return false; 2313 } 2314 2315 /* If the ramblock we're sending is a small page? Never bother. */ 2316 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) { 2317 return false; 2318 } 2319 2320 /* Not in postcopy at all? */ 2321 if (!migration_in_postcopy()) { 2322 return false; 2323 } 2324 2325 /* 2326 * If we're already handling a postcopy request, don't preempt as this page 2327 * has got the same high priority. 2328 */ 2329 if (pss->postcopy_requested) { 2330 return false; 2331 } 2332 2333 /* If there's postcopy requests, then check it up! */ 2334 return postcopy_has_request(rs); 2335 } 2336 2337 /* Returns true if we preempted precopy, false otherwise */ 2338 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss) 2339 { 2340 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state; 2341 2342 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page); 2343 2344 /* 2345 * Time to preempt precopy. Cache current PSS into preempt state, so that 2346 * after handling the postcopy pages we can recover to it. We need to do 2347 * so because the dest VM will have partial of the precopy huge page kept 2348 * over in its tmp huge page caches; better move on with it when we can. 
2349 */ 2350 p_state->ram_block = pss->block; 2351 p_state->ram_page = pss->page; 2352 p_state->preempted = true; 2353 } 2354 2355 /* Whether we're preempted by a postcopy request during sending a huge page */ 2356 static bool postcopy_preempt_triggered(RAMState *rs) 2357 { 2358 return rs->postcopy_preempt_state.preempted; 2359 } 2360 2361 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, 2362 bool postcopy_requested) 2363 { 2364 PostcopyPreemptState *state = &rs->postcopy_preempt_state; 2365 2366 assert(state->preempted); 2367 2368 pss->block = state->ram_block; 2369 pss->page = state->ram_page; 2370 2371 /* Whether this is a postcopy request? */ 2372 pss->postcopy_requested = postcopy_requested; 2373 /* 2374 * When restoring a preempted page, the old data resides in PRECOPY 2375 * slow channel, even if postcopy_requested is set. So always use 2376 * PRECOPY channel here. 2377 */ 2378 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; 2379 2380 trace_postcopy_preempt_restored(pss->block->idstr, pss->page); 2381 2382 /* Reset preempt state, most importantly, set preempted==false */ 2383 postcopy_preempt_reset(rs); 2384 } 2385 2386 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss) 2387 { 2388 MigrationState *s = migrate_get_current(); 2389 unsigned int channel = pss->postcopy_target_channel; 2390 QEMUFile *next; 2391 2392 if (channel != rs->postcopy_channel) { 2393 if (channel == RAM_CHANNEL_PRECOPY) { 2394 next = s->to_dst_file; 2395 } else { 2396 next = s->postcopy_qemufile_src; 2397 } 2398 /* Update and cache the current channel */ 2399 rs->f = next; 2400 rs->postcopy_channel = channel; 2401 2402 /* 2403 * If channel switched, reset last_sent_block since the old sent block 2404 * may not be on the same channel. 2405 */ 2406 rs->last_sent_block = NULL; 2407 2408 trace_postcopy_preempt_switch_channel(channel); 2409 } 2410 2411 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2412 } 2413 2414 /* We need to make sure rs->f always points to the default channel elsewhere */ 2415 static void postcopy_preempt_reset_channel(RAMState *rs) 2416 { 2417 if (migrate_postcopy_preempt() && migration_in_postcopy()) { 2418 rs->postcopy_channel = RAM_CHANNEL_PRECOPY; 2419 rs->f = migrate_get_current()->to_dst_file; 2420 trace_postcopy_preempt_reset_channel(); 2421 } 2422 } 2423 2424 /** 2425 * ram_save_host_page: save a whole host page 2426 * 2427 * Starting at *offset send pages up to the end of the current host 2428 * page. It's valid for the initial offset to point into the middle of 2429 * a host page in which case the remainder of the hostpage is sent. 2430 * Only dirty target pages are sent. Note that the host page size may 2431 * be a huge page for this block. 2432 * The saving stops at the boundary of the used_length of the block 2433 * if the RAMBlock isn't a multiple of the host page size. 
2434 * 2435 * Returns the number of pages written or negative on error 2436 * 2437 * @rs: current RAM state 2438 * @pss: data about the page we want to send 2439 */ 2440 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2441 { 2442 int tmppages, pages = 0; 2443 size_t pagesize_bits = 2444 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2445 unsigned long hostpage_boundary = 2446 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2447 unsigned long start_page = pss->page; 2448 int res; 2449 2450 if (ramblock_is_ignored(pss->block)) { 2451 error_report("block %s should not be migrated !", pss->block->idstr); 2452 return 0; 2453 } 2454 2455 if (migrate_postcopy_preempt() && migration_in_postcopy()) { 2456 postcopy_preempt_choose_channel(rs, pss); 2457 } 2458 2459 do { 2460 if (postcopy_needs_preempt(rs, pss)) { 2461 postcopy_do_preempt(rs, pss); 2462 break; 2463 } 2464 2465 /* Check the pages is dirty and if it is send it */ 2466 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2467 tmppages = ram_save_target_page(rs, pss); 2468 if (tmppages < 0) { 2469 return tmppages; 2470 } 2471 2472 pages += tmppages; 2473 /* 2474 * Allow rate limiting to happen in the middle of huge pages if 2475 * something is sent in the current iteration. 2476 */ 2477 if (pagesize_bits > 1 && tmppages > 0) { 2478 migration_rate_limit(); 2479 } 2480 } 2481 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2482 } while ((pss->page < hostpage_boundary) && 2483 offset_in_ramblock(pss->block, 2484 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2485 /* The offset we leave with is the min boundary of host page and block */ 2486 pss->page = MIN(pss->page, hostpage_boundary); 2487 2488 /* 2489 * When with postcopy preempt mode, flush the data as soon as possible for 2490 * postcopy requests, because we've already sent a whole huge page, so the 2491 * dst node should already have enough resource to atomically filling in 2492 * the current missing page. 2493 * 2494 * More importantly, when using separate postcopy channel, we must do 2495 * explicit flush or it won't flush until the buffer is full. 2496 */ 2497 if (migrate_postcopy_preempt() && pss->postcopy_requested) { 2498 qemu_fflush(rs->f); 2499 } 2500 2501 res = ram_save_release_protection(rs, pss, start_page); 2502 return (res < 0 ? res : pages); 2503 } 2504 2505 /** 2506 * ram_find_and_save_block: finds a dirty page and sends it to f 2507 * 2508 * Called within an RCU critical section. 2509 * 2510 * Returns the number of pages written where zero means no dirty pages, 2511 * or negative on error 2512 * 2513 * @rs: current RAM state 2514 * 2515 * On systems where host-page-size > target-page-size it will send all the 2516 * pages in a host page that are dirty. 2517 */ 2518 static int ram_find_and_save_block(RAMState *rs) 2519 { 2520 PageSearchStatus pss; 2521 int pages = 0; 2522 bool again, found; 2523 2524 /* No dirty page as there is zero RAM */ 2525 if (!ram_bytes_total()) { 2526 return pages; 2527 } 2528 2529 pss.block = rs->last_seen_block; 2530 pss.page = rs->last_page; 2531 pss.complete_round = false; 2532 2533 if (!pss.block) { 2534 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2535 } 2536 2537 do { 2538 again = true; 2539 found = get_queued_page(rs, &pss); 2540 2541 if (!found) { 2542 /* 2543 * Recover previous precopy ramblock/offset if postcopy has 2544 * preempted precopy. Otherwise find the next dirty bit. 
2545 */ 2546 if (postcopy_preempt_triggered(rs)) { 2547 postcopy_preempt_restore(rs, &pss, false); 2548 found = true; 2549 } else { 2550 /* priority queue empty, so just search for something dirty */ 2551 found = find_dirty_block(rs, &pss, &again); 2552 } 2553 } 2554 2555 if (found) { 2556 pages = ram_save_host_page(rs, &pss); 2557 } 2558 } while (!pages && again); 2559 2560 rs->last_seen_block = pss.block; 2561 rs->last_page = pss.page; 2562 2563 return pages; 2564 } 2565 2566 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2567 { 2568 uint64_t pages = size / TARGET_PAGE_SIZE; 2569 2570 if (zero) { 2571 ram_counters.duplicate += pages; 2572 } else { 2573 ram_counters.normal += pages; 2574 ram_transferred_add(size); 2575 qemu_file_credit_transfer(f, size); 2576 } 2577 } 2578 2579 static uint64_t ram_bytes_total_common(bool count_ignored) 2580 { 2581 RAMBlock *block; 2582 uint64_t total = 0; 2583 2584 RCU_READ_LOCK_GUARD(); 2585 2586 if (count_ignored) { 2587 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2588 total += block->used_length; 2589 } 2590 } else { 2591 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2592 total += block->used_length; 2593 } 2594 } 2595 return total; 2596 } 2597 2598 uint64_t ram_bytes_total(void) 2599 { 2600 return ram_bytes_total_common(false); 2601 } 2602 2603 static void xbzrle_load_setup(void) 2604 { 2605 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2606 } 2607 2608 static void xbzrle_load_cleanup(void) 2609 { 2610 g_free(XBZRLE.decoded_buf); 2611 XBZRLE.decoded_buf = NULL; 2612 } 2613 2614 static void ram_state_cleanup(RAMState **rsp) 2615 { 2616 if (*rsp) { 2617 migration_page_queue_free(*rsp); 2618 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2619 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2620 g_free(*rsp); 2621 *rsp = NULL; 2622 } 2623 } 2624 2625 static void xbzrle_cleanup(void) 2626 { 2627 XBZRLE_cache_lock(); 2628 if (XBZRLE.cache) { 2629 cache_fini(XBZRLE.cache); 2630 g_free(XBZRLE.encoded_buf); 2631 g_free(XBZRLE.current_buf); 2632 g_free(XBZRLE.zero_target_page); 2633 XBZRLE.cache = NULL; 2634 XBZRLE.encoded_buf = NULL; 2635 XBZRLE.current_buf = NULL; 2636 XBZRLE.zero_target_page = NULL; 2637 } 2638 XBZRLE_cache_unlock(); 2639 } 2640 2641 static void ram_save_cleanup(void *opaque) 2642 { 2643 RAMState **rsp = opaque; 2644 RAMBlock *block; 2645 2646 /* We don't use dirty log with background snapshots */ 2647 if (!migrate_background_snapshot()) { 2648 /* caller have hold iothread lock or is in a bh, so there is 2649 * no writing race against the migration bitmap 2650 */ 2651 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2652 /* 2653 * do not stop dirty log without starting it, since 2654 * memory_global_dirty_log_stop will assert that 2655 * memory_global_dirty_log_start/stop used in pairs 2656 */ 2657 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2658 } 2659 } 2660 2661 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2662 g_free(block->clear_bmap); 2663 block->clear_bmap = NULL; 2664 g_free(block->bmap); 2665 block->bmap = NULL; 2666 } 2667 2668 xbzrle_cleanup(); 2669 compress_threads_save_cleanup(); 2670 ram_state_cleanup(rsp); 2671 } 2672 2673 static void ram_state_reset(RAMState *rs) 2674 { 2675 rs->last_seen_block = NULL; 2676 rs->last_sent_block = NULL; 2677 rs->last_page = 0; 2678 rs->last_version = ram_list.version; 2679 rs->xbzrle_enabled = false; 2680 postcopy_preempt_reset(rs); 2681 rs->postcopy_channel = RAM_CHANNEL_PRECOPY; 2682 } 2683 2684 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2685 2686 /* **** functions for 
postcopy ***** */ 2687 2688 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2689 { 2690 struct RAMBlock *block; 2691 2692 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2693 unsigned long *bitmap = block->bmap; 2694 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2695 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2696 2697 while (run_start < range) { 2698 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2699 ram_discard_range(block->idstr, 2700 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2701 ((ram_addr_t)(run_end - run_start)) 2702 << TARGET_PAGE_BITS); 2703 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2704 } 2705 } 2706 } 2707 2708 /** 2709 * postcopy_send_discard_bm_ram: discard a RAMBlock 2710 * 2711 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2712 * 2713 * @ms: current migration state 2714 * @block: RAMBlock to discard 2715 */ 2716 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2717 { 2718 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2719 unsigned long current; 2720 unsigned long *bitmap = block->bmap; 2721 2722 for (current = 0; current < end; ) { 2723 unsigned long one = find_next_bit(bitmap, end, current); 2724 unsigned long zero, discard_length; 2725 2726 if (one >= end) { 2727 break; 2728 } 2729 2730 zero = find_next_zero_bit(bitmap, end, one + 1); 2731 2732 if (zero >= end) { 2733 discard_length = end - one; 2734 } else { 2735 discard_length = zero - one; 2736 } 2737 postcopy_discard_send_range(ms, one, discard_length); 2738 current = one + discard_length; 2739 } 2740 } 2741 2742 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2743 2744 /** 2745 * postcopy_each_ram_send_discard: discard all RAMBlocks 2746 * 2747 * Utility for the outgoing postcopy code. 2748 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2749 * passing it bitmap indexes and name. 2750 * (qemu_ram_foreach_block ends up passing unscaled lengths 2751 * which would mean postcopy code would have to deal with target page) 2752 * 2753 * @ms: current migration state 2754 */ 2755 static void postcopy_each_ram_send_discard(MigrationState *ms) 2756 { 2757 struct RAMBlock *block; 2758 2759 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2760 postcopy_discard_send_init(ms, block->idstr); 2761 2762 /* 2763 * Deal with TPS != HPS and huge pages. It discard any partially sent 2764 * host-page size chunks, mark any partially dirty host-page size 2765 * chunks as all dirty. In this case the host-page is the host-page 2766 * for the particular RAMBlock, i.e. it might be a huge page. 2767 */ 2768 postcopy_chunk_hostpages_pass(ms, block); 2769 2770 /* 2771 * Postcopy sends chunks of bitmap over the wire, but it 2772 * just needs indexes at this point, avoids it having 2773 * target page specific code. 2774 */ 2775 postcopy_send_discard_bm_ram(ms, block); 2776 postcopy_discard_send_finish(ms); 2777 } 2778 } 2779 2780 /** 2781 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2782 * 2783 * Helper for postcopy_chunk_hostpages; it's called twice to 2784 * canonicalize the two bitmaps, that are similar, but one is 2785 * inverted. 2786 * 2787 * Postcopy requires that all target pages in a hostpage are dirty or 2788 * clean, not a mix. This function canonicalizes the bitmaps. 
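 *
 * As a worked example (assuming 2 MiB host pages and 4 KiB target
 * pages, i.e. host_ratio == 512): if a dirty run starts or ends in the
 * middle of a host page, every target page of that host page is
 * re-marked dirty, so the whole host page gets sent again rather than
 * being partially discarded.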
2789 * 2790 * @ms: current migration state 2791 * @block: block that contains the page we want to canonicalize 2792 */ 2793 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2794 { 2795 RAMState *rs = ram_state; 2796 unsigned long *bitmap = block->bmap; 2797 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2798 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2799 unsigned long run_start; 2800 2801 if (block->page_size == TARGET_PAGE_SIZE) { 2802 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2803 return; 2804 } 2805 2806 /* Find a dirty page */ 2807 run_start = find_next_bit(bitmap, pages, 0); 2808 2809 while (run_start < pages) { 2810 2811 /* 2812 * If the start of this run of pages is in the middle of a host 2813 * page, then we need to fixup this host page. 2814 */ 2815 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2816 /* Find the end of this run */ 2817 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2818 /* 2819 * If the end isn't at the start of a host page, then the 2820 * run doesn't finish at the end of a host page 2821 * and we need to discard. 2822 */ 2823 } 2824 2825 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2826 unsigned long page; 2827 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2828 host_ratio); 2829 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2830 2831 /* Clean up the bitmap */ 2832 for (page = fixup_start_addr; 2833 page < fixup_start_addr + host_ratio; page++) { 2834 /* 2835 * Remark them as dirty, updating the count for any pages 2836 * that weren't previously dirty. 2837 */ 2838 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2839 } 2840 } 2841 2842 /* Find the next dirty page for the next iteration */ 2843 run_start = find_next_bit(bitmap, pages, run_start); 2844 } 2845 } 2846 2847 /** 2848 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2849 * 2850 * Transmit the set of pages to be discarded after precopy to the target 2851 * these are pages that: 2852 * a) Have been previously transmitted but are now dirty again 2853 * b) Pages that have never been transmitted, this ensures that 2854 * any pages on the destination that have been mapped by background 2855 * tasks get discarded (transparent huge pages is the specific concern) 2856 * Hopefully this is pretty sparse 2857 * 2858 * @ms: current migration state 2859 */ 2860 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2861 { 2862 RAMState *rs = ram_state; 2863 2864 RCU_READ_LOCK_GUARD(); 2865 2866 /* This should be our last sync, the src is now paused */ 2867 migration_bitmap_sync(rs); 2868 2869 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2870 rs->last_seen_block = NULL; 2871 rs->last_sent_block = NULL; 2872 rs->last_page = 0; 2873 2874 postcopy_each_ram_send_discard(ms); 2875 2876 trace_ram_postcopy_send_discard_bitmap(); 2877 } 2878 2879 /** 2880 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2881 * 2882 * Returns zero on success 2883 * 2884 * @rbname: name of the RAMBlock of the request. NULL means the 2885 * same that last one. 
2886 * @start: RAMBlock starting page 2887 * @length: RAMBlock size 2888 */ 2889 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2890 { 2891 trace_ram_discard_range(rbname, start, length); 2892 2893 RCU_READ_LOCK_GUARD(); 2894 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2895 2896 if (!rb) { 2897 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2898 return -1; 2899 } 2900 2901 /* 2902 * On source VM, we don't need to update the received bitmap since 2903 * we don't even have one. 2904 */ 2905 if (rb->receivedmap) { 2906 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2907 length >> qemu_target_page_bits()); 2908 } 2909 2910 return ram_block_discard_range(rb, start, length); 2911 } 2912 2913 /* 2914 * For every allocation, we will try not to crash the VM if the 2915 * allocation failed. 2916 */ 2917 static int xbzrle_init(void) 2918 { 2919 Error *local_err = NULL; 2920 2921 if (!migrate_use_xbzrle()) { 2922 return 0; 2923 } 2924 2925 XBZRLE_cache_lock(); 2926 2927 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2928 if (!XBZRLE.zero_target_page) { 2929 error_report("%s: Error allocating zero page", __func__); 2930 goto err_out; 2931 } 2932 2933 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2934 TARGET_PAGE_SIZE, &local_err); 2935 if (!XBZRLE.cache) { 2936 error_report_err(local_err); 2937 goto free_zero_page; 2938 } 2939 2940 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2941 if (!XBZRLE.encoded_buf) { 2942 error_report("%s: Error allocating encoded_buf", __func__); 2943 goto free_cache; 2944 } 2945 2946 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2947 if (!XBZRLE.current_buf) { 2948 error_report("%s: Error allocating current_buf", __func__); 2949 goto free_encoded_buf; 2950 } 2951 2952 /* We are all good */ 2953 XBZRLE_cache_unlock(); 2954 return 0; 2955 2956 free_encoded_buf: 2957 g_free(XBZRLE.encoded_buf); 2958 XBZRLE.encoded_buf = NULL; 2959 free_cache: 2960 cache_fini(XBZRLE.cache); 2961 XBZRLE.cache = NULL; 2962 free_zero_page: 2963 g_free(XBZRLE.zero_target_page); 2964 XBZRLE.zero_target_page = NULL; 2965 err_out: 2966 XBZRLE_cache_unlock(); 2967 return -ENOMEM; 2968 } 2969 2970 static int ram_state_init(RAMState **rsp) 2971 { 2972 *rsp = g_try_new0(RAMState, 1); 2973 2974 if (!*rsp) { 2975 error_report("%s: Init ramstate fail", __func__); 2976 return -1; 2977 } 2978 2979 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2980 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2981 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2982 2983 /* 2984 * Count the total number of pages used by ram blocks not including any 2985 * gaps due to alignment or unplugs. 2986 * This must match with the initial values of dirty bitmap. 
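 *
 * For example, with 4 GiB of migratable RAM and 4 KiB target pages this
 * starts migration_dirty_pages at 4 GiB >> 12 == 1048576, one bit per
 * page, matching the all-ones bitmaps set up later in
 * ram_list_init_bitmaps().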
2987 */ 2988 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2989 ram_state_reset(*rsp); 2990 2991 return 0; 2992 } 2993 2994 static void ram_list_init_bitmaps(void) 2995 { 2996 MigrationState *ms = migrate_get_current(); 2997 RAMBlock *block; 2998 unsigned long pages; 2999 uint8_t shift; 3000 3001 /* Skip setting bitmap if there is no RAM */ 3002 if (ram_bytes_total()) { 3003 shift = ms->clear_bitmap_shift; 3004 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3005 error_report("clear_bitmap_shift (%u) too big, using " 3006 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3007 shift = CLEAR_BITMAP_SHIFT_MAX; 3008 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3009 error_report("clear_bitmap_shift (%u) too small, using " 3010 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3011 shift = CLEAR_BITMAP_SHIFT_MIN; 3012 } 3013 3014 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3015 pages = block->max_length >> TARGET_PAGE_BITS; 3016 /* 3017 * The initial dirty bitmap for migration must be set with all 3018 * ones to make sure we'll migrate every guest RAM page to 3019 * destination. 3020 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3021 * new migration after a failed migration, ram_list. 3022 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3023 * guest memory. 3024 */ 3025 block->bmap = bitmap_new(pages); 3026 bitmap_set(block->bmap, 0, pages); 3027 block->clear_bmap_shift = shift; 3028 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3029 } 3030 } 3031 } 3032 3033 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3034 { 3035 unsigned long pages; 3036 RAMBlock *rb; 3037 3038 RCU_READ_LOCK_GUARD(); 3039 3040 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3041 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3042 rs->migration_dirty_pages -= pages; 3043 } 3044 } 3045 3046 static void ram_init_bitmaps(RAMState *rs) 3047 { 3048 /* For memory_global_dirty_log_start below. */ 3049 qemu_mutex_lock_iothread(); 3050 qemu_mutex_lock_ramlist(); 3051 3052 WITH_RCU_READ_LOCK_GUARD() { 3053 ram_list_init_bitmaps(); 3054 /* We don't use dirty log with background snapshots */ 3055 if (!migrate_background_snapshot()) { 3056 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3057 migration_bitmap_sync_precopy(rs); 3058 } 3059 } 3060 qemu_mutex_unlock_ramlist(); 3061 qemu_mutex_unlock_iothread(); 3062 3063 /* 3064 * After an eventual first bitmap sync, fixup the initial bitmap 3065 * containing all 1s to exclude any discarded pages from migration. 3066 */ 3067 migration_bitmap_clear_discarded_pages(rs); 3068 } 3069 3070 static int ram_init_all(RAMState **rsp) 3071 { 3072 if (ram_state_init(rsp)) { 3073 return -1; 3074 } 3075 3076 if (xbzrle_init()) { 3077 ram_state_cleanup(rsp); 3078 return -1; 3079 } 3080 3081 ram_init_bitmaps(*rsp); 3082 3083 return 0; 3084 } 3085 3086 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3087 { 3088 RAMBlock *block; 3089 uint64_t pages = 0; 3090 3091 /* 3092 * Postcopy is not using xbzrle/compression, so no need for that. 3093 * Also, since source are already halted, we don't need to care 3094 * about dirty page logging as well. 3095 */ 3096 3097 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3098 pages += bitmap_count_one(block->bmap, 3099 block->used_length >> TARGET_PAGE_BITS); 3100 } 3101 3102 /* This may not be aligned with current bitmaps. Recalculate. 
*/ 3103 rs->migration_dirty_pages = pages; 3104 3105 ram_state_reset(rs); 3106 3107 /* Update RAMState cache of output QEMUFile */ 3108 rs->f = out; 3109 3110 trace_ram_state_resume_prepare(pages); 3111 } 3112 3113 /* 3114 * This function clears bits of the free pages reported by the caller from the 3115 * migration dirty bitmap. @addr is the host address corresponding to the 3116 * start of the continuous guest free pages, and @len is the total bytes of 3117 * those pages. 3118 */ 3119 void qemu_guest_free_page_hint(void *addr, size_t len) 3120 { 3121 RAMBlock *block; 3122 ram_addr_t offset; 3123 size_t used_len, start, npages; 3124 MigrationState *s = migrate_get_current(); 3125 3126 /* This function is currently expected to be used during live migration */ 3127 if (!migration_is_setup_or_active(s->state)) { 3128 return; 3129 } 3130 3131 for (; len > 0; len -= used_len, addr += used_len) { 3132 block = qemu_ram_block_from_host(addr, false, &offset); 3133 if (unlikely(!block || offset >= block->used_length)) { 3134 /* 3135 * The implementation might not support RAMBlock resize during 3136 * live migration, but it could happen in theory with future 3137 * updates. So we add a check here to capture that case. 3138 */ 3139 error_report_once("%s unexpected error", __func__); 3140 return; 3141 } 3142 3143 if (len <= block->used_length - offset) { 3144 used_len = len; 3145 } else { 3146 used_len = block->used_length - offset; 3147 } 3148 3149 start = offset >> TARGET_PAGE_BITS; 3150 npages = used_len >> TARGET_PAGE_BITS; 3151 3152 qemu_mutex_lock(&ram_state->bitmap_mutex); 3153 /* 3154 * The skipped free pages are equavalent to be sent from clear_bmap's 3155 * perspective, so clear the bits from the memory region bitmap which 3156 * are initially set. Otherwise those skipped pages will be sent in 3157 * the next round after syncing from the memory region bitmap. 3158 */ 3159 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3160 ram_state->migration_dirty_pages -= 3161 bitmap_count_one_with_offset(block->bmap, start, npages); 3162 bitmap_clear(block->bmap, start, npages); 3163 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3164 } 3165 } 3166 3167 /* 3168 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3169 * long-running RCU critical section. When rcu-reclaims in the code 3170 * start to become numerous it will be necessary to reduce the 3171 * granularity of these critical sections. 3172 */ 3173 3174 /** 3175 * ram_save_setup: Setup RAM for migration 3176 * 3177 * Returns zero to indicate success and negative for error 3178 * 3179 * @f: QEMUFile where to send the data 3180 * @opaque: RAMState pointer 3181 */ 3182 static int ram_save_setup(QEMUFile *f, void *opaque) 3183 { 3184 RAMState **rsp = opaque; 3185 RAMBlock *block; 3186 int ret; 3187 3188 if (compress_threads_save_setup()) { 3189 return -1; 3190 } 3191 3192 /* migration has already setup the bitmap, reuse it. 
*/ 3193 if (!migration_in_colo_state()) { 3194 if (ram_init_all(rsp) != 0) { 3195 compress_threads_save_cleanup(); 3196 return -1; 3197 } 3198 } 3199 (*rsp)->f = f; 3200 3201 WITH_RCU_READ_LOCK_GUARD() { 3202 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3203 3204 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3205 qemu_put_byte(f, strlen(block->idstr)); 3206 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3207 qemu_put_be64(f, block->used_length); 3208 if (migrate_postcopy_ram() && block->page_size != 3209 qemu_host_page_size) { 3210 qemu_put_be64(f, block->page_size); 3211 } 3212 if (migrate_ignore_shared()) { 3213 qemu_put_be64(f, block->mr->addr); 3214 } 3215 } 3216 } 3217 3218 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3219 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3220 3221 ret = multifd_send_sync_main(f); 3222 if (ret < 0) { 3223 return ret; 3224 } 3225 3226 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3227 qemu_fflush(f); 3228 3229 return 0; 3230 } 3231 3232 /** 3233 * ram_save_iterate: iterative stage for migration 3234 * 3235 * Returns zero to indicate success and negative for error 3236 * 3237 * @f: QEMUFile where to send the data 3238 * @opaque: RAMState pointer 3239 */ 3240 static int ram_save_iterate(QEMUFile *f, void *opaque) 3241 { 3242 RAMState **temp = opaque; 3243 RAMState *rs = *temp; 3244 int ret = 0; 3245 int i; 3246 int64_t t0; 3247 int done = 0; 3248 3249 if (blk_mig_bulk_active()) { 3250 /* Avoid transferring ram during bulk phase of block migration as 3251 * the bulk phase will usually take a long time and transferring 3252 * ram updates during that time is pointless. */ 3253 goto out; 3254 } 3255 3256 /* 3257 * We'll take this lock a little bit long, but it's okay for two reasons. 3258 * Firstly, the only possible other thread to take it is who calls 3259 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3260 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3261 * guarantees that we'll at least released it in a regular basis. 3262 */ 3263 qemu_mutex_lock(&rs->bitmap_mutex); 3264 WITH_RCU_READ_LOCK_GUARD() { 3265 if (ram_list.version != rs->last_version) { 3266 ram_state_reset(rs); 3267 } 3268 3269 /* Read version before ram_list.blocks */ 3270 smp_rmb(); 3271 3272 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3273 3274 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3275 i = 0; 3276 while ((ret = qemu_file_rate_limit(f)) == 0 || 3277 postcopy_has_request(rs)) { 3278 int pages; 3279 3280 if (qemu_file_get_error(f)) { 3281 break; 3282 } 3283 3284 pages = ram_find_and_save_block(rs); 3285 /* no more pages to sent */ 3286 if (pages == 0) { 3287 done = 1; 3288 break; 3289 } 3290 3291 if (pages < 0) { 3292 qemu_file_set_error(f, pages); 3293 break; 3294 } 3295 3296 rs->target_page_count += pages; 3297 3298 /* 3299 * During postcopy, it is necessary to make sure one whole host 3300 * page is sent in one chunk. 3301 */ 3302 if (migrate_postcopy_ram()) { 3303 flush_compressed_data(rs); 3304 } 3305 3306 /* 3307 * we want to check in the 1st loop, just in case it was the 1st 3308 * time and we had to sync the dirty bitmap. 
3309 * qemu_clock_get_ns() is a bit expensive, so we only check each 3310 * some iterations 3311 */ 3312 if ((i & 63) == 0) { 3313 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3314 1000000; 3315 if (t1 > MAX_WAIT) { 3316 trace_ram_save_iterate_big_wait(t1, i); 3317 break; 3318 } 3319 } 3320 i++; 3321 } 3322 } 3323 qemu_mutex_unlock(&rs->bitmap_mutex); 3324 3325 postcopy_preempt_reset_channel(rs); 3326 3327 /* 3328 * Must occur before EOS (or any QEMUFile operation) 3329 * because of RDMA protocol. 3330 */ 3331 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3332 3333 out: 3334 if (ret >= 0 3335 && migration_is_setup_or_active(migrate_get_current()->state)) { 3336 ret = multifd_send_sync_main(rs->f); 3337 if (ret < 0) { 3338 return ret; 3339 } 3340 3341 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3342 qemu_fflush(f); 3343 ram_transferred_add(8); 3344 3345 ret = qemu_file_get_error(f); 3346 } 3347 if (ret < 0) { 3348 return ret; 3349 } 3350 3351 return done; 3352 } 3353 3354 /** 3355 * ram_save_complete: function called to send the remaining amount of ram 3356 * 3357 * Returns zero to indicate success or negative on error 3358 * 3359 * Called with iothread lock 3360 * 3361 * @f: QEMUFile where to send the data 3362 * @opaque: RAMState pointer 3363 */ 3364 static int ram_save_complete(QEMUFile *f, void *opaque) 3365 { 3366 RAMState **temp = opaque; 3367 RAMState *rs = *temp; 3368 int ret = 0; 3369 3370 rs->last_stage = !migration_in_colo_state(); 3371 3372 WITH_RCU_READ_LOCK_GUARD() { 3373 if (!migration_in_postcopy()) { 3374 migration_bitmap_sync_precopy(rs); 3375 } 3376 3377 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3378 3379 /* try transferring iterative blocks of memory */ 3380 3381 /* flush all remaining blocks regardless of rate limiting */ 3382 while (true) { 3383 int pages; 3384 3385 pages = ram_find_and_save_block(rs); 3386 /* no more blocks to sent */ 3387 if (pages == 0) { 3388 break; 3389 } 3390 if (pages < 0) { 3391 ret = pages; 3392 break; 3393 } 3394 } 3395 3396 flush_compressed_data(rs); 3397 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3398 } 3399 3400 if (ret < 0) { 3401 return ret; 3402 } 3403 3404 postcopy_preempt_reset_channel(rs); 3405 3406 ret = multifd_send_sync_main(rs->f); 3407 if (ret < 0) { 3408 return ret; 3409 } 3410 3411 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3412 qemu_fflush(f); 3413 3414 return 0; 3415 } 3416 3417 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3418 uint64_t *res_precopy_only, 3419 uint64_t *res_compatible, 3420 uint64_t *res_postcopy_only) 3421 { 3422 RAMState **temp = opaque; 3423 RAMState *rs = *temp; 3424 uint64_t remaining_size; 3425 3426 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3427 3428 if (!migration_in_postcopy() && 3429 remaining_size < max_size) { 3430 qemu_mutex_lock_iothread(); 3431 WITH_RCU_READ_LOCK_GUARD() { 3432 migration_bitmap_sync_precopy(rs); 3433 } 3434 qemu_mutex_unlock_iothread(); 3435 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3436 } 3437 3438 if (migrate_postcopy_ram()) { 3439 /* We can do postcopy, and all the data is postcopiable */ 3440 *res_compatible += remaining_size; 3441 } else { 3442 *res_precopy_only += remaining_size; 3443 } 3444 } 3445 3446 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3447 { 3448 unsigned int xh_len; 3449 int xh_flags; 3450 uint8_t *loaded_data; 3451 3452 /* extract RLE header */ 3453 xh_flags = qemu_get_byte(f); 3454 xh_len = qemu_get_be16(f); 3455 3456 if (xh_flags != 
ENCODING_FLAG_XBZRLE) { 3457 error_report("Failed to load XBZRLE page - wrong compression!"); 3458 return -1; 3459 } 3460 3461 if (xh_len > TARGET_PAGE_SIZE) { 3462 error_report("Failed to load XBZRLE page - len overflow!"); 3463 return -1; 3464 } 3465 loaded_data = XBZRLE.decoded_buf; 3466 /* load data and decode */ 3467 /* it can change loaded_data to point to an internal buffer */ 3468 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3469 3470 /* decode RLE */ 3471 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3472 TARGET_PAGE_SIZE) == -1) { 3473 error_report("Failed to load XBZRLE page - decode error!"); 3474 return -1; 3475 } 3476 3477 return 0; 3478 } 3479 3480 /** 3481 * ram_block_from_stream: read a RAMBlock id from the migration stream 3482 * 3483 * Must be called from within a rcu critical section. 3484 * 3485 * Returns a pointer from within the RCU-protected ram_list. 3486 * 3487 * @mis: the migration incoming state pointer 3488 * @f: QEMUFile where to read the data from 3489 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3490 * @channel: the channel we're using 3491 */ 3492 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3493 QEMUFile *f, int flags, 3494 int channel) 3495 { 3496 RAMBlock *block = mis->last_recv_block[channel]; 3497 char id[256]; 3498 uint8_t len; 3499 3500 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3501 if (!block) { 3502 error_report("Ack, bad migration stream!"); 3503 return NULL; 3504 } 3505 return block; 3506 } 3507 3508 len = qemu_get_byte(f); 3509 qemu_get_buffer(f, (uint8_t *)id, len); 3510 id[len] = 0; 3511 3512 block = qemu_ram_block_by_name(id); 3513 if (!block) { 3514 error_report("Can't find block %s", id); 3515 return NULL; 3516 } 3517 3518 if (ramblock_is_ignored(block)) { 3519 error_report("block %s should not be migrated !", id); 3520 return NULL; 3521 } 3522 3523 mis->last_recv_block[channel] = block; 3524 3525 return block; 3526 } 3527 3528 static inline void *host_from_ram_block_offset(RAMBlock *block, 3529 ram_addr_t offset) 3530 { 3531 if (!offset_in_ramblock(block, offset)) { 3532 return NULL; 3533 } 3534 3535 return block->host + offset; 3536 } 3537 3538 static void *host_page_from_ram_block_offset(RAMBlock *block, 3539 ram_addr_t offset) 3540 { 3541 /* Note: Explicitly no check against offset_in_ramblock(). */ 3542 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3543 block->page_size); 3544 } 3545 3546 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3547 ram_addr_t offset) 3548 { 3549 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3550 } 3551 3552 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3553 ram_addr_t offset, bool record_bitmap) 3554 { 3555 if (!offset_in_ramblock(block, offset)) { 3556 return NULL; 3557 } 3558 if (!block->colo_cache) { 3559 error_report("%s: colo_cache is NULL in block :%s", 3560 __func__, block->idstr); 3561 return NULL; 3562 } 3563 3564 /* 3565 * During colo checkpoint, we need bitmap of these migrated pages. 3566 * It help us to decide which pages in ram cache should be flushed 3567 * into VM's RAM later. 3568 */ 3569 if (record_bitmap && 3570 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3571 ram_state->migration_dirty_pages++; 3572 } 3573 return block->colo_cache + offset; 3574 } 3575 3576 /** 3577 * ram_handle_compressed: handle the zero page case 3578 * 3579 * If a page (or a whole RDMA chunk) has been 3580 * determined to be zero, then zap it. 
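 * Note that an already-zero host page is left untouched: the
 * buffer_is_zero() check below skips the memset(), which presumably
 * avoids dirtying (and, for anonymous memory, populating) pages that
 * never need to be written.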
3581 * 3582 * @host: host address for the zero page 3583 * @ch: what the page is filled from. We only support zero 3584 * @size: size of the zero page 3585 */ 3586 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3587 { 3588 if (ch != 0 || !buffer_is_zero(host, size)) { 3589 memset(host, ch, size); 3590 } 3591 } 3592 3593 /* return the size after decompression, or negative value on error */ 3594 static int 3595 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3596 const uint8_t *source, size_t source_len) 3597 { 3598 int err; 3599 3600 err = inflateReset(stream); 3601 if (err != Z_OK) { 3602 return -1; 3603 } 3604 3605 stream->avail_in = source_len; 3606 stream->next_in = (uint8_t *)source; 3607 stream->avail_out = dest_len; 3608 stream->next_out = dest; 3609 3610 err = inflate(stream, Z_NO_FLUSH); 3611 if (err != Z_STREAM_END) { 3612 return -1; 3613 } 3614 3615 return stream->total_out; 3616 } 3617 3618 static void *do_data_decompress(void *opaque) 3619 { 3620 DecompressParam *param = opaque; 3621 unsigned long pagesize; 3622 uint8_t *des; 3623 int len, ret; 3624 3625 qemu_mutex_lock(¶m->mutex); 3626 while (!param->quit) { 3627 if (param->des) { 3628 des = param->des; 3629 len = param->len; 3630 param->des = 0; 3631 qemu_mutex_unlock(¶m->mutex); 3632 3633 pagesize = TARGET_PAGE_SIZE; 3634 3635 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3636 param->compbuf, len); 3637 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3638 error_report("decompress data failed"); 3639 qemu_file_set_error(decomp_file, ret); 3640 } 3641 3642 qemu_mutex_lock(&decomp_done_lock); 3643 param->done = true; 3644 qemu_cond_signal(&decomp_done_cond); 3645 qemu_mutex_unlock(&decomp_done_lock); 3646 3647 qemu_mutex_lock(¶m->mutex); 3648 } else { 3649 qemu_cond_wait(¶m->cond, ¶m->mutex); 3650 } 3651 } 3652 qemu_mutex_unlock(¶m->mutex); 3653 3654 return NULL; 3655 } 3656 3657 static int wait_for_decompress_done(void) 3658 { 3659 int idx, thread_count; 3660 3661 if (!migrate_use_compression()) { 3662 return 0; 3663 } 3664 3665 thread_count = migrate_decompress_threads(); 3666 qemu_mutex_lock(&decomp_done_lock); 3667 for (idx = 0; idx < thread_count; idx++) { 3668 while (!decomp_param[idx].done) { 3669 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3670 } 3671 } 3672 qemu_mutex_unlock(&decomp_done_lock); 3673 return qemu_file_get_error(decomp_file); 3674 } 3675 3676 static void compress_threads_load_cleanup(void) 3677 { 3678 int i, thread_count; 3679 3680 if (!migrate_use_compression()) { 3681 return; 3682 } 3683 thread_count = migrate_decompress_threads(); 3684 for (i = 0; i < thread_count; i++) { 3685 /* 3686 * we use it as a indicator which shows if the thread is 3687 * properly init'd or not 3688 */ 3689 if (!decomp_param[i].compbuf) { 3690 break; 3691 } 3692 3693 qemu_mutex_lock(&decomp_param[i].mutex); 3694 decomp_param[i].quit = true; 3695 qemu_cond_signal(&decomp_param[i].cond); 3696 qemu_mutex_unlock(&decomp_param[i].mutex); 3697 } 3698 for (i = 0; i < thread_count; i++) { 3699 if (!decomp_param[i].compbuf) { 3700 break; 3701 } 3702 3703 qemu_thread_join(decompress_threads + i); 3704 qemu_mutex_destroy(&decomp_param[i].mutex); 3705 qemu_cond_destroy(&decomp_param[i].cond); 3706 inflateEnd(&decomp_param[i].stream); 3707 g_free(decomp_param[i].compbuf); 3708 decomp_param[i].compbuf = NULL; 3709 } 3710 g_free(decompress_threads); 3711 g_free(decomp_param); 3712 decompress_threads = NULL; 3713 decomp_param = NULL; 3714 decomp_file = NULL; 3715 } 
3716 3717 static int compress_threads_load_setup(QEMUFile *f) 3718 { 3719 int i, thread_count; 3720 3721 if (!migrate_use_compression()) { 3722 return 0; 3723 } 3724 3725 thread_count = migrate_decompress_threads(); 3726 decompress_threads = g_new0(QemuThread, thread_count); 3727 decomp_param = g_new0(DecompressParam, thread_count); 3728 qemu_mutex_init(&decomp_done_lock); 3729 qemu_cond_init(&decomp_done_cond); 3730 decomp_file = f; 3731 for (i = 0; i < thread_count; i++) { 3732 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3733 goto exit; 3734 } 3735 3736 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3737 qemu_mutex_init(&decomp_param[i].mutex); 3738 qemu_cond_init(&decomp_param[i].cond); 3739 decomp_param[i].done = true; 3740 decomp_param[i].quit = false; 3741 qemu_thread_create(decompress_threads + i, "decompress", 3742 do_data_decompress, decomp_param + i, 3743 QEMU_THREAD_JOINABLE); 3744 } 3745 return 0; 3746 exit: 3747 compress_threads_load_cleanup(); 3748 return -1; 3749 } 3750 3751 static void decompress_data_with_multi_threads(QEMUFile *f, 3752 void *host, int len) 3753 { 3754 int idx, thread_count; 3755 3756 thread_count = migrate_decompress_threads(); 3757 QEMU_LOCK_GUARD(&decomp_done_lock); 3758 while (true) { 3759 for (idx = 0; idx < thread_count; idx++) { 3760 if (decomp_param[idx].done) { 3761 decomp_param[idx].done = false; 3762 qemu_mutex_lock(&decomp_param[idx].mutex); 3763 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3764 decomp_param[idx].des = host; 3765 decomp_param[idx].len = len; 3766 qemu_cond_signal(&decomp_param[idx].cond); 3767 qemu_mutex_unlock(&decomp_param[idx].mutex); 3768 break; 3769 } 3770 } 3771 if (idx < thread_count) { 3772 break; 3773 } else { 3774 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3775 } 3776 } 3777 } 3778 3779 static void colo_init_ram_state(void) 3780 { 3781 ram_state_init(&ram_state); 3782 } 3783 3784 /* 3785 * colo cache: this is for secondary VM, we cache the whole 3786 * memory of the secondary VM, it is need to hold the global lock 3787 * to call this helper. 3788 */ 3789 int colo_init_ram_cache(void) 3790 { 3791 RAMBlock *block; 3792 3793 WITH_RCU_READ_LOCK_GUARD() { 3794 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3795 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3796 NULL, false, false); 3797 if (!block->colo_cache) { 3798 error_report("%s: Can't alloc memory for COLO cache of block %s," 3799 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3800 block->used_length); 3801 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3802 if (block->colo_cache) { 3803 qemu_anon_ram_free(block->colo_cache, block->used_length); 3804 block->colo_cache = NULL; 3805 } 3806 } 3807 return -errno; 3808 } 3809 if (!machine_dump_guest_core(current_machine)) { 3810 qemu_madvise(block->colo_cache, block->used_length, 3811 QEMU_MADV_DONTDUMP); 3812 } 3813 } 3814 } 3815 3816 /* 3817 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3818 * with to decide which page in cache should be flushed into SVM's RAM. Here 3819 * we use the same name 'ram_bitmap' as for migration. 
3820 */ 3821 if (ram_bytes_total()) { 3822 RAMBlock *block; 3823 3824 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3825 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3826 block->bmap = bitmap_new(pages); 3827 } 3828 } 3829 3830 colo_init_ram_state(); 3831 return 0; 3832 } 3833 3834 /* TODO: duplicated with ram_init_bitmaps */ 3835 void colo_incoming_start_dirty_log(void) 3836 { 3837 RAMBlock *block = NULL; 3838 /* For memory_global_dirty_log_start below. */ 3839 qemu_mutex_lock_iothread(); 3840 qemu_mutex_lock_ramlist(); 3841 3842 memory_global_dirty_log_sync(); 3843 WITH_RCU_READ_LOCK_GUARD() { 3844 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3845 ramblock_sync_dirty_bitmap(ram_state, block); 3846 /* Discard this dirty bitmap record */ 3847 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3848 } 3849 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3850 } 3851 ram_state->migration_dirty_pages = 0; 3852 qemu_mutex_unlock_ramlist(); 3853 qemu_mutex_unlock_iothread(); 3854 } 3855 3856 /* It is need to hold the global lock to call this helper */ 3857 void colo_release_ram_cache(void) 3858 { 3859 RAMBlock *block; 3860 3861 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3862 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3863 g_free(block->bmap); 3864 block->bmap = NULL; 3865 } 3866 3867 WITH_RCU_READ_LOCK_GUARD() { 3868 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3869 if (block->colo_cache) { 3870 qemu_anon_ram_free(block->colo_cache, block->used_length); 3871 block->colo_cache = NULL; 3872 } 3873 } 3874 } 3875 ram_state_cleanup(&ram_state); 3876 } 3877 3878 /** 3879 * ram_load_setup: Setup RAM for migration incoming side 3880 * 3881 * Returns zero to indicate success and negative for error 3882 * 3883 * @f: QEMUFile where to receive the data 3884 * @opaque: RAMState pointer 3885 */ 3886 static int ram_load_setup(QEMUFile *f, void *opaque) 3887 { 3888 if (compress_threads_load_setup(f)) { 3889 return -1; 3890 } 3891 3892 xbzrle_load_setup(); 3893 ramblock_recv_map_init(); 3894 3895 return 0; 3896 } 3897 3898 static int ram_load_cleanup(void *opaque) 3899 { 3900 RAMBlock *rb; 3901 3902 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3903 qemu_ram_block_writeback(rb); 3904 } 3905 3906 xbzrle_load_cleanup(); 3907 compress_threads_load_cleanup(); 3908 3909 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3910 g_free(rb->receivedmap); 3911 rb->receivedmap = NULL; 3912 } 3913 3914 return 0; 3915 } 3916 3917 /** 3918 * ram_postcopy_incoming_init: allocate postcopy data structures 3919 * 3920 * Returns 0 for success and negative if there was one error 3921 * 3922 * @mis: current migration incoming state 3923 * 3924 * Allocate data structures etc needed by incoming migration with 3925 * postcopy-ram. postcopy-ram's similarly names 3926 * postcopy_ram_incoming_init does the work. 3927 */ 3928 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3929 { 3930 return postcopy_ram_incoming_init(mis); 3931 } 3932 3933 /** 3934 * ram_load_postcopy: load a page in postcopy case 3935 * 3936 * Returns 0 for success or -errno in case of error 3937 * 3938 * Called in postcopy mode by ram_load(). 3939 * rcu_read_lock is taken prior to this being called. 
 *
 * @f: QEMUFile where to receive the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If there is a qemu file error, we should stop here; "addr"
         * may then be invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TPs are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip filling page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
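             * If the whole host page stays zero, postcopy_place_page_zero()
             * is used at placement time and page_buffer is never read.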
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use a temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer, to make sure the buffer is still valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Check for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into the SVM's memory.
 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
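 *
 * For each run of dirty bits found by colo_bitmap_find_dirty() this roughly
 * amounts to (illustration only):
 *
 *     memcpy(block->host + off, block->colo_cache + off,
 *            num * TARGET_PAGE_SIZE);
 *
 * with the corresponding dirty bits cleared first, so only pages touched
 * since the last checkpoint are copied.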
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into the SVM's memory directly; we put it into colo_cache first.
             * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage while the VM had to be stopped, which is a time-consuming
             * process.
             * Here we optimize it with a trick: back up every page during
             * the migration process while COLO is enabled. Although this
             * affects the speed of the migration, it obviously reduces the
             * downtime compared with backing up all of the SVM's memory in
             * the COLO preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it to become the initial dirty
 * bitmap.  This is only used when a postcopy migration is paused and
 * is being resumed from a middle point.
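 *
 * The data consumed here is: the bitmap size in bytes (8 bytes, big endian),
 * the little-endian bitmap itself (padded to a multiple of 8 bytes), and the
 * 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker.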
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused).
     * The dirty bitmap won't change, so we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to become the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock.  If this
     * is the last one to sync, we need to notify the main send thread.
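     * (ram_dirty_bitmap_sync_all() does one qemu_sem_wait() on
     * s->rp_state.rp_sem per RAMBlock, so posting once per reloaded block
     * is enough.)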
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about resizes.
         * When growing, the new memory was not available on the source, so
         * no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
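
/*
 * Note: the version number registered above for the "ram" section (4) must
 * match the check in ram_load(), which rejects any other version_id.
 */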