/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we just rename it.
74 */ 75 76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 77 #define RAM_SAVE_FLAG_ZERO 0x02 78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 79 #define RAM_SAVE_FLAG_PAGE 0x08 80 #define RAM_SAVE_FLAG_EOS 0x10 81 #define RAM_SAVE_FLAG_CONTINUE 0x20 82 #define RAM_SAVE_FLAG_XBZRLE 0x40 83 /* 0x80 is reserved in migration.h start with 0x100 next */ 84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 85 86 XBZRLECacheStats xbzrle_counters; 87 88 /* used by the search for pages to send */ 89 struct PageSearchStatus { 90 /* The migration channel used for a specific host page */ 91 QEMUFile *pss_channel; 92 /* Last block from where we have sent data */ 93 RAMBlock *last_sent_block; 94 /* Current block being searched */ 95 RAMBlock *block; 96 /* Current page to search from */ 97 unsigned long page; 98 /* Set once we wrap around */ 99 bool complete_round; 100 /* Whether we're sending a host page */ 101 bool host_page_sending; 102 /* The start/end of current host page. Invalid if host_page_sending==false */ 103 unsigned long host_page_start; 104 unsigned long host_page_end; 105 }; 106 typedef struct PageSearchStatus PageSearchStatus; 107 108 /* struct contains XBZRLE cache and a static page 109 used by the compression */ 110 static struct { 111 /* buffer used for XBZRLE encoding */ 112 uint8_t *encoded_buf; 113 /* buffer for storing page content */ 114 uint8_t *current_buf; 115 /* Cache for XBZRLE, Protected by lock. */ 116 PageCache *cache; 117 QemuMutex lock; 118 /* it will store a page full of zeros */ 119 uint8_t *zero_target_page; 120 /* buffer used for XBZRLE decoding */ 121 uint8_t *decoded_buf; 122 } XBZRLE; 123 124 static void XBZRLE_cache_lock(void) 125 { 126 if (migrate_use_xbzrle()) { 127 qemu_mutex_lock(&XBZRLE.lock); 128 } 129 } 130 131 static void XBZRLE_cache_unlock(void) 132 { 133 if (migrate_use_xbzrle()) { 134 qemu_mutex_unlock(&XBZRLE.lock); 135 } 136 } 137 138 /** 139 * xbzrle_cache_resize: resize the xbzrle cache 140 * 141 * This function is called from migrate_params_apply in main 142 * thread, possibly while a migration is in progress. A running 143 * migration may be using the cache and might finish during this call, 144 * hence changes to the cache are protected by XBZRLE.lock(). 
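 *
 * The new size typically arrives from the management layer, e.g. via a
 * QMP command along these lines (illustrative only):
 *
 *   { "execute": "migrate-set-parameters",
 *     "arguments": { "xbzrle-cache-size": 536870912 } }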
145 * 146 * Returns 0 for success or -1 for error 147 * 148 * @new_size: new cache size 149 * @errp: set *errp if the check failed, with reason 150 */ 151 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 152 { 153 PageCache *new_cache; 154 int64_t ret = 0; 155 156 /* Check for truncation */ 157 if (new_size != (size_t)new_size) { 158 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 159 "exceeding address space"); 160 return -1; 161 } 162 163 if (new_size == migrate_xbzrle_cache_size()) { 164 /* nothing to do */ 165 return 0; 166 } 167 168 XBZRLE_cache_lock(); 169 170 if (XBZRLE.cache != NULL) { 171 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 172 if (!new_cache) { 173 ret = -1; 174 goto out; 175 } 176 177 cache_fini(XBZRLE.cache); 178 XBZRLE.cache = new_cache; 179 } 180 out: 181 XBZRLE_cache_unlock(); 182 return ret; 183 } 184 185 static bool postcopy_preempt_active(void) 186 { 187 return migrate_postcopy_preempt() && migration_in_postcopy(); 188 } 189 190 bool ramblock_is_ignored(RAMBlock *block) 191 { 192 return !qemu_ram_is_migratable(block) || 193 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 194 } 195 196 #undef RAMBLOCK_FOREACH 197 198 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 199 { 200 RAMBlock *block; 201 int ret = 0; 202 203 RCU_READ_LOCK_GUARD(); 204 205 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 206 ret = func(block, opaque); 207 if (ret) { 208 break; 209 } 210 } 211 return ret; 212 } 213 214 static void ramblock_recv_map_init(void) 215 { 216 RAMBlock *rb; 217 218 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 219 assert(!rb->receivedmap); 220 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 221 } 222 } 223 224 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 225 { 226 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 227 rb->receivedmap); 228 } 229 230 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 231 { 232 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 233 } 234 235 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 236 { 237 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 238 } 239 240 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 241 size_t nr) 242 { 243 bitmap_set_atomic(rb->receivedmap, 244 ramblock_recv_bitmap_offset(host_addr, rb), 245 nr); 246 } 247 248 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 249 250 /* 251 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 252 * 253 * Returns >0 if success with sent bytes, or <0 if error. 254 */ 255 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 256 const char *block_name) 257 { 258 RAMBlock *block = qemu_ram_block_by_name(block_name); 259 unsigned long *le_bitmap, nbits; 260 uint64_t size; 261 262 if (!block) { 263 error_report("%s: invalid block name: %s", __func__, block_name); 264 return -1; 265 } 266 267 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 268 269 /* 270 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 271 * machines we may need 4 more bytes for padding (see below 272 * comment). So extend it a bit before hand. 273 */ 274 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 275 276 /* 277 * Always use little endian when sending the bitmap. This is 278 * required that when source and destination VMs are not using the 279 * same endianness. (Note: big endian won't work.) 
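 *
 * (Illustrative size math for the buffer sent below: a block of 260
 * target pages needs DIV_ROUND_UP(260, 8) = 33 bytes, which the
 * ROUND_UP() further down pads to 40 bytes so that a 64bit peer can
 * consume the buffer in whole longs.)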
280 */ 281 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 282 283 /* Size of the bitmap, in bytes */ 284 size = DIV_ROUND_UP(nbits, 8); 285 286 /* 287 * size is always aligned to 8 bytes for 64bit machines, but it 288 * may not be true for 32bit machines. We need this padding to 289 * make sure the migration can survive even between 32bit and 290 * 64bit machines. 291 */ 292 size = ROUND_UP(size, 8); 293 294 qemu_put_be64(file, size); 295 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 296 /* 297 * Mark as an end, in case the middle part is screwed up due to 298 * some "mysterious" reason. 299 */ 300 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 301 qemu_fflush(file); 302 303 g_free(le_bitmap); 304 305 if (qemu_file_get_error(file)) { 306 return qemu_file_get_error(file); 307 } 308 309 return size + sizeof(size); 310 } 311 312 /* 313 * An outstanding page request, on the source, having been received 314 * and queued 315 */ 316 struct RAMSrcPageRequest { 317 RAMBlock *rb; 318 hwaddr offset; 319 hwaddr len; 320 321 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 322 }; 323 324 /* State of RAM for migration */ 325 struct RAMState { 326 /* 327 * PageSearchStatus structures for the channels when send pages. 328 * Protected by the bitmap_mutex. 329 */ 330 PageSearchStatus pss[RAM_CHANNEL_MAX]; 331 /* UFFD file descriptor, used in 'write-tracking' migration */ 332 int uffdio_fd; 333 /* Last block that we have visited searching for dirty pages */ 334 RAMBlock *last_seen_block; 335 /* Last dirty target page we have sent */ 336 ram_addr_t last_page; 337 /* last ram version we have seen */ 338 uint32_t last_version; 339 /* How many times we have dirty too many pages */ 340 int dirty_rate_high_cnt; 341 /* these variables are used for bitmap sync */ 342 /* last time we did a full bitmap_sync */ 343 int64_t time_last_bitmap_sync; 344 /* bytes transferred at start_time */ 345 uint64_t bytes_xfer_prev; 346 /* number of dirty pages since start_time */ 347 uint64_t num_dirty_pages_period; 348 /* xbzrle misses since the beginning of the period */ 349 uint64_t xbzrle_cache_miss_prev; 350 /* Amount of xbzrle pages since the beginning of the period */ 351 uint64_t xbzrle_pages_prev; 352 /* Amount of xbzrle encoded bytes since the beginning of the period */ 353 uint64_t xbzrle_bytes_prev; 354 /* Start using XBZRLE (e.g., after the first round). 
*/ 355 bool xbzrle_enabled; 356 /* Are we on the last stage of migration */ 357 bool last_stage; 358 /* compression statistics since the beginning of the period */ 359 /* amount of count that no free thread to compress data */ 360 uint64_t compress_thread_busy_prev; 361 /* amount bytes after compression */ 362 uint64_t compressed_size_prev; 363 /* amount of compressed pages */ 364 uint64_t compress_pages_prev; 365 366 /* total handled target pages at the beginning of period */ 367 uint64_t target_page_count_prev; 368 /* total handled target pages since start */ 369 uint64_t target_page_count; 370 /* number of dirty bits in the bitmap */ 371 uint64_t migration_dirty_pages; 372 /* 373 * Protects: 374 * - dirty/clear bitmap 375 * - migration_dirty_pages 376 * - pss structures 377 */ 378 QemuMutex bitmap_mutex; 379 /* The RAMBlock used in the last src_page_requests */ 380 RAMBlock *last_req_rb; 381 /* Queue of outstanding page requests from the destination */ 382 QemuMutex src_page_req_mutex; 383 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 384 }; 385 typedef struct RAMState RAMState; 386 387 static RAMState *ram_state; 388 389 static NotifierWithReturnList precopy_notifier_list; 390 391 /* Whether postcopy has queued requests? */ 392 static bool postcopy_has_request(RAMState *rs) 393 { 394 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 395 } 396 397 void precopy_infrastructure_init(void) 398 { 399 notifier_with_return_list_init(&precopy_notifier_list); 400 } 401 402 void precopy_add_notifier(NotifierWithReturn *n) 403 { 404 notifier_with_return_list_add(&precopy_notifier_list, n); 405 } 406 407 void precopy_remove_notifier(NotifierWithReturn *n) 408 { 409 notifier_with_return_remove(n); 410 } 411 412 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 413 { 414 PrecopyNotifyData pnd; 415 pnd.reason = reason; 416 pnd.errp = errp; 417 418 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 419 } 420 421 uint64_t ram_bytes_remaining(void) 422 { 423 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 424 0; 425 } 426 427 /* 428 * NOTE: not all stats in ram_counters are used in reality. See comments 429 * for struct MigrationAtomicStats. The ultimate result of ram migration 430 * counters will be a merged version with both ram_counters and the atomic 431 * fields in ram_atomic_counters. 
432 */ 433 MigrationStats ram_counters; 434 MigrationAtomicStats ram_atomic_counters; 435 436 void ram_transferred_add(uint64_t bytes) 437 { 438 if (runstate_is_running()) { 439 ram_counters.precopy_bytes += bytes; 440 } else if (migration_in_postcopy()) { 441 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes); 442 } else { 443 ram_counters.downtime_bytes += bytes; 444 } 445 stat64_add(&ram_atomic_counters.transferred, bytes); 446 } 447 448 void dirty_sync_missed_zero_copy(void) 449 { 450 ram_counters.dirty_sync_missed_zero_copy++; 451 } 452 453 CompressionStats compression_counters; 454 455 struct CompressParam { 456 bool done; 457 bool quit; 458 bool zero_page; 459 QEMUFile *file; 460 QemuMutex mutex; 461 QemuCond cond; 462 RAMBlock *block; 463 ram_addr_t offset; 464 465 /* internally used fields */ 466 z_stream stream; 467 uint8_t *originbuf; 468 }; 469 typedef struct CompressParam CompressParam; 470 471 struct DecompressParam { 472 bool done; 473 bool quit; 474 QemuMutex mutex; 475 QemuCond cond; 476 void *des; 477 uint8_t *compbuf; 478 int len; 479 z_stream stream; 480 }; 481 typedef struct DecompressParam DecompressParam; 482 483 static CompressParam *comp_param; 484 static QemuThread *compress_threads; 485 /* comp_done_cond is used to wake up the migration thread when 486 * one of the compression threads has finished the compression. 487 * comp_done_lock is used to co-work with comp_done_cond. 488 */ 489 static QemuMutex comp_done_lock; 490 static QemuCond comp_done_cond; 491 492 static QEMUFile *decomp_file; 493 static DecompressParam *decomp_param; 494 static QemuThread *decompress_threads; 495 static QemuMutex decomp_done_lock; 496 static QemuCond decomp_done_cond; 497 498 static int ram_save_host_page_urgent(PageSearchStatus *pss); 499 500 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 501 ram_addr_t offset, uint8_t *source_buf); 502 503 /* NOTE: page is the PFN not real ram_addr_t. */ 504 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 505 { 506 pss->block = rb; 507 pss->page = page; 508 pss->complete_round = false; 509 } 510 511 /* 512 * Check whether two PSSs are actively sending the same page. Return true 513 * if it is, false otherwise. 
514 */ 515 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 516 { 517 return pss1->host_page_sending && pss2->host_page_sending && 518 (pss1->host_page_start == pss2->host_page_start); 519 } 520 521 static void *do_data_compress(void *opaque) 522 { 523 CompressParam *param = opaque; 524 RAMBlock *block; 525 ram_addr_t offset; 526 bool zero_page; 527 528 qemu_mutex_lock(¶m->mutex); 529 while (!param->quit) { 530 if (param->block) { 531 block = param->block; 532 offset = param->offset; 533 param->block = NULL; 534 qemu_mutex_unlock(¶m->mutex); 535 536 zero_page = do_compress_ram_page(param->file, ¶m->stream, 537 block, offset, param->originbuf); 538 539 qemu_mutex_lock(&comp_done_lock); 540 param->done = true; 541 param->zero_page = zero_page; 542 qemu_cond_signal(&comp_done_cond); 543 qemu_mutex_unlock(&comp_done_lock); 544 545 qemu_mutex_lock(¶m->mutex); 546 } else { 547 qemu_cond_wait(¶m->cond, ¶m->mutex); 548 } 549 } 550 qemu_mutex_unlock(¶m->mutex); 551 552 return NULL; 553 } 554 555 static void compress_threads_save_cleanup(void) 556 { 557 int i, thread_count; 558 559 if (!migrate_use_compression() || !comp_param) { 560 return; 561 } 562 563 thread_count = migrate_compress_threads(); 564 for (i = 0; i < thread_count; i++) { 565 /* 566 * we use it as a indicator which shows if the thread is 567 * properly init'd or not 568 */ 569 if (!comp_param[i].file) { 570 break; 571 } 572 573 qemu_mutex_lock(&comp_param[i].mutex); 574 comp_param[i].quit = true; 575 qemu_cond_signal(&comp_param[i].cond); 576 qemu_mutex_unlock(&comp_param[i].mutex); 577 578 qemu_thread_join(compress_threads + i); 579 qemu_mutex_destroy(&comp_param[i].mutex); 580 qemu_cond_destroy(&comp_param[i].cond); 581 deflateEnd(&comp_param[i].stream); 582 g_free(comp_param[i].originbuf); 583 qemu_fclose(comp_param[i].file); 584 comp_param[i].file = NULL; 585 } 586 qemu_mutex_destroy(&comp_done_lock); 587 qemu_cond_destroy(&comp_done_cond); 588 g_free(compress_threads); 589 g_free(comp_param); 590 compress_threads = NULL; 591 comp_param = NULL; 592 } 593 594 static int compress_threads_save_setup(void) 595 { 596 int i, thread_count; 597 598 if (!migrate_use_compression()) { 599 return 0; 600 } 601 thread_count = migrate_compress_threads(); 602 compress_threads = g_new0(QemuThread, thread_count); 603 comp_param = g_new0(CompressParam, thread_count); 604 qemu_cond_init(&comp_done_cond); 605 qemu_mutex_init(&comp_done_lock); 606 for (i = 0; i < thread_count; i++) { 607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 608 if (!comp_param[i].originbuf) { 609 goto exit; 610 } 611 612 if (deflateInit(&comp_param[i].stream, 613 migrate_compress_level()) != Z_OK) { 614 g_free(comp_param[i].originbuf); 615 goto exit; 616 } 617 618 /* comp_param[i].file is just used as a dummy buffer to save data, 619 * set its ops to empty. 
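         * The null I/O channel backing it discards anything actually
         * written to it; the compressed bytes are expected to accumulate
         * in the QEMUFile's internal buffer until qemu_put_qemu_file()
         * copies them into the real migration stream.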
620 */ 621 comp_param[i].file = qemu_file_new_output( 622 QIO_CHANNEL(qio_channel_null_new())); 623 comp_param[i].done = true; 624 comp_param[i].quit = false; 625 qemu_mutex_init(&comp_param[i].mutex); 626 qemu_cond_init(&comp_param[i].cond); 627 qemu_thread_create(compress_threads + i, "compress", 628 do_data_compress, comp_param + i, 629 QEMU_THREAD_JOINABLE); 630 } 631 return 0; 632 633 exit: 634 compress_threads_save_cleanup(); 635 return -1; 636 } 637 638 /** 639 * save_page_header: write page header to wire 640 * 641 * If this is the 1st block, it also writes the block identification 642 * 643 * Returns the number of bytes written 644 * 645 * @pss: current PSS channel status 646 * @block: block that contains the page we want to send 647 * @offset: offset inside the block for the page 648 * in the lower bits, it contains flags 649 */ 650 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block, 651 ram_addr_t offset) 652 { 653 size_t size, len; 654 bool same_block = (block == pss->last_sent_block); 655 QEMUFile *f = pss->pss_channel; 656 657 if (same_block) { 658 offset |= RAM_SAVE_FLAG_CONTINUE; 659 } 660 qemu_put_be64(f, offset); 661 size = 8; 662 663 if (!same_block) { 664 len = strlen(block->idstr); 665 qemu_put_byte(f, len); 666 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 667 size += 1 + len; 668 pss->last_sent_block = block; 669 } 670 return size; 671 } 672 673 /** 674 * mig_throttle_guest_down: throttle down the guest 675 * 676 * Reduce amount of guest cpu execution to hopefully slow down memory 677 * writes. If guest dirty memory rate is reduced below the rate at 678 * which we can transfer pages to the destination then we should be 679 * able to complete migration. Some workloads dirty memory way too 680 * fast and will not effectively converge, even with auto-converge. 681 */ 682 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 683 uint64_t bytes_dirty_threshold) 684 { 685 MigrationState *s = migrate_get_current(); 686 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 687 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 688 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 689 int pct_max = s->parameters.max_cpu_throttle; 690 691 uint64_t throttle_now = cpu_throttle_get_percentage(); 692 uint64_t cpu_now, cpu_ideal, throttle_inc; 693 694 /* We have not started throttling yet. Let's start it. */ 695 if (!cpu_throttle_active()) { 696 cpu_throttle_set(pct_initial); 697 } else { 698 /* Throttling already on, just increase the rate */ 699 if (!pct_tailslow) { 700 throttle_inc = pct_increment; 701 } else { 702 /* Compute the ideal CPU percentage used by Guest, which may 703 * make the dirty rate match the dirty rate threshold. */ 704 cpu_now = 100 - throttle_now; 705 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 706 bytes_dirty_period); 707 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 708 } 709 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 710 } 711 } 712 713 void mig_throttle_counter_reset(void) 714 { 715 RAMState *rs = ram_state; 716 717 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 718 rs->num_dirty_pages_period = 0; 719 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); 720 } 721 722 /** 723 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 724 * 725 * @rs: current RAM state 726 * @current_addr: address for the zero page 727 * 728 * Update the xbzrle cache to reflect a page that's been sent as all 0. 
729 * The important thing is that a stale (not-yet-0'd) page be replaced 730 * by the new data. 731 * As a bonus, if the page wasn't in the cache it gets added so that 732 * when a small write is made into the 0'd page it gets XBZRLE sent. 733 */ 734 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 735 { 736 /* We don't care if this fails to allocate a new cache page 737 * as long as it updated an old one */ 738 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 739 ram_counters.dirty_sync_count); 740 } 741 742 #define ENCODING_FLAG_XBZRLE 0x1 743 744 /** 745 * save_xbzrle_page: compress and send current page 746 * 747 * Returns: 1 means that we wrote the page 748 * 0 means that page is identical to the one already sent 749 * -1 means that xbzrle would be longer than normal 750 * 751 * @rs: current RAM state 752 * @pss: current PSS channel 753 * @current_data: pointer to the address of the page contents 754 * @current_addr: addr of the page 755 * @block: block that contains the page we want to send 756 * @offset: offset inside the block for the page 757 */ 758 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 759 uint8_t **current_data, ram_addr_t current_addr, 760 RAMBlock *block, ram_addr_t offset) 761 { 762 int encoded_len = 0, bytes_xbzrle; 763 uint8_t *prev_cached_page; 764 QEMUFile *file = pss->pss_channel; 765 766 if (!cache_is_cached(XBZRLE.cache, current_addr, 767 ram_counters.dirty_sync_count)) { 768 xbzrle_counters.cache_miss++; 769 if (!rs->last_stage) { 770 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 771 ram_counters.dirty_sync_count) == -1) { 772 return -1; 773 } else { 774 /* update *current_data when the page has been 775 inserted into cache */ 776 *current_data = get_cached_data(XBZRLE.cache, current_addr); 777 } 778 } 779 return -1; 780 } 781 782 /* 783 * Reaching here means the page has hit the xbzrle cache, no matter what 784 * encoding result it is (normal encoding, overflow or skipping the page), 785 * count the page as encoded. This is used to calculate the encoding rate. 786 * 787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 788 * 2nd page turns out to be skipped (i.e. no new bytes written to the 789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 790 * skipped page included. In this way, the encoding rate can tell if the 791 * guest page is good for xbzrle encoding. 792 */ 793 xbzrle_counters.pages++; 794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 795 796 /* save current buffer into memory */ 797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 798 799 /* XBZRLE encoding (if there is no overflow) */ 800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 802 TARGET_PAGE_SIZE); 803 804 /* 805 * Update the cache contents, so that it corresponds to the data 806 * sent, in all cases except where we skip the page. 807 */ 808 if (!rs->last_stage && encoded_len != 0) { 809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 810 /* 811 * In the case where we couldn't compress, ensure that the caller 812 * sends the data from the cache, since the guest might have 813 * changed the RAM since we copied it. 
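         * (On overflow the caller falls back to save_normal_page(), but
         * with *current_data now pointing at the cached copy, so what goes
         * on the wire matches what the cache believes was sent.)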
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of the current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or to the end of the ramblock when
 * nothing is found.  Note that when pss->host_page_sending==true we are
 * in the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we are sending a host page, only look for dirty pages within
     * the current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then the start address
     * of the small chunk will always be aligned to 64 pages, so the
     * bitmap will always be aligned to unsigned long.  We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
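     *
     * Each clear_bmap bit covers 1 << clear_bmap_shift target pages, so
     * the range is first widened to whole chunks: e.g. with a shift of 18
     * (the usual default) every chunk spans 2^18 pages and chunk_start /
     * chunk_end are aligned down / up to that boundary.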
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within the memory region of the start of the
 * contiguous dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear the dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk, because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery, where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
1019 */ 1020 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 1021 { 1022 uint64_t cleared_bits = 0; 1023 1024 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 1025 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1026 MemoryRegionSection section = { 1027 .mr = rb->mr, 1028 .offset_within_region = 0, 1029 .size = int128_make64(qemu_ram_get_used_length(rb)), 1030 }; 1031 1032 ram_discard_manager_replay_discarded(rdm, §ion, 1033 dirty_bitmap_clear_section, 1034 &cleared_bits); 1035 } 1036 return cleared_bits; 1037 } 1038 1039 /* 1040 * Check if a host-page aligned page falls into a discarded range as managed by 1041 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 1042 * 1043 * Note: The result is only stable while migrating (precopy/postcopy). 1044 */ 1045 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 1046 { 1047 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1048 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1049 MemoryRegionSection section = { 1050 .mr = rb->mr, 1051 .offset_within_region = start, 1052 .size = int128_make64(qemu_ram_pagesize(rb)), 1053 }; 1054 1055 return !ram_discard_manager_is_populated(rdm, §ion); 1056 } 1057 return false; 1058 } 1059 1060 /* Called with RCU critical section */ 1061 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 1062 { 1063 uint64_t new_dirty_pages = 1064 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 1065 1066 rs->migration_dirty_pages += new_dirty_pages; 1067 rs->num_dirty_pages_period += new_dirty_pages; 1068 } 1069 1070 /** 1071 * ram_pagesize_summary: calculate all the pagesizes of a VM 1072 * 1073 * Returns a summary bitmap of the page sizes of all RAMBlocks 1074 * 1075 * For VMs with just normal pages this is equivalent to the host page 1076 * size. If it's got some huge pages then it's the OR of all the 1077 * different page sizes. 
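 *
 * For example (illustrative): a guest backed by 4 KiB pages plus one
 * 2 MiB hugetlbfs block would report 0x1000 | 0x200000 = 0x201000.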
1078 */ 1079 uint64_t ram_pagesize_summary(void) 1080 { 1081 RAMBlock *block; 1082 uint64_t summary = 0; 1083 1084 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1085 summary |= block->page_size; 1086 } 1087 1088 return summary; 1089 } 1090 1091 uint64_t ram_get_total_transferred_pages(void) 1092 { 1093 return stat64_get(&ram_atomic_counters.normal) + 1094 stat64_get(&ram_atomic_counters.duplicate) + 1095 compression_counters.pages + xbzrle_counters.pages; 1096 } 1097 1098 static void migration_update_rates(RAMState *rs, int64_t end_time) 1099 { 1100 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1101 double compressed_size; 1102 1103 /* calculate period counters */ 1104 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1105 / (end_time - rs->time_last_bitmap_sync); 1106 1107 if (!page_count) { 1108 return; 1109 } 1110 1111 if (migrate_use_xbzrle()) { 1112 double encoded_size, unencoded_size; 1113 1114 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1115 rs->xbzrle_cache_miss_prev) / page_count; 1116 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1117 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1118 TARGET_PAGE_SIZE; 1119 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1120 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1121 xbzrle_counters.encoding_rate = 0; 1122 } else { 1123 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1124 } 1125 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1126 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1127 } 1128 1129 if (migrate_use_compression()) { 1130 compression_counters.busy_rate = (double)(compression_counters.busy - 1131 rs->compress_thread_busy_prev) / page_count; 1132 rs->compress_thread_busy_prev = compression_counters.busy; 1133 1134 compressed_size = compression_counters.compressed_size - 1135 rs->compressed_size_prev; 1136 if (compressed_size) { 1137 double uncompressed_size = (compression_counters.pages - 1138 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1139 1140 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1141 compression_counters.compression_rate = 1142 uncompressed_size / compressed_size; 1143 1144 rs->compress_pages_prev = compression_counters.pages; 1145 rs->compressed_size_prev = compression_counters.compressed_size; 1146 } 1147 } 1148 } 1149 1150 static void migration_trigger_throttle(RAMState *rs) 1151 { 1152 MigrationState *s = migrate_get_current(); 1153 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1154 uint64_t bytes_xfer_period = 1155 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev; 1156 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1157 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1158 1159 /* During block migration the auto-converge logic incorrectly detects 1160 * that ram migration makes no progress. Avoid this by disabling the 1161 * throttling logic during the bulk phase of block migration. */ 1162 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1163 /* The following detection logic can be refined later. For now: 1164 Check to see if the ratio between dirtied bytes and the approx. 1165 amount of bytes that just got transferred since the last time 1166 we were in this routine reaches the threshold. If that happens 1167 twice, start or increase throttling. 
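
           For example (illustrative): with throttle-trigger-threshold at
           its default of 50 and 100 MiB transferred in the period,
           bytes_dirty_threshold works out to 50 MiB; dirtying more than
           that in two consecutive periods is what trips the throttle.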
*/ 1168 1169 if ((bytes_dirty_period > bytes_dirty_threshold) && 1170 (++rs->dirty_rate_high_cnt >= 2)) { 1171 trace_migration_throttle(); 1172 rs->dirty_rate_high_cnt = 0; 1173 mig_throttle_guest_down(bytes_dirty_period, 1174 bytes_dirty_threshold); 1175 } 1176 } 1177 } 1178 1179 static void migration_bitmap_sync(RAMState *rs) 1180 { 1181 RAMBlock *block; 1182 int64_t end_time; 1183 1184 ram_counters.dirty_sync_count++; 1185 1186 if (!rs->time_last_bitmap_sync) { 1187 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1188 } 1189 1190 trace_migration_bitmap_sync_start(); 1191 memory_global_dirty_log_sync(); 1192 1193 qemu_mutex_lock(&rs->bitmap_mutex); 1194 WITH_RCU_READ_LOCK_GUARD() { 1195 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1196 ramblock_sync_dirty_bitmap(rs, block); 1197 } 1198 ram_counters.remaining = ram_bytes_remaining(); 1199 } 1200 qemu_mutex_unlock(&rs->bitmap_mutex); 1201 1202 memory_global_after_dirty_log_sync(); 1203 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1204 1205 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1206 1207 /* more than 1 second = 1000 millisecons */ 1208 if (end_time > rs->time_last_bitmap_sync + 1000) { 1209 migration_trigger_throttle(rs); 1210 1211 migration_update_rates(rs, end_time); 1212 1213 rs->target_page_count_prev = rs->target_page_count; 1214 1215 /* reset period counters */ 1216 rs->time_last_bitmap_sync = end_time; 1217 rs->num_dirty_pages_period = 0; 1218 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); 1219 } 1220 if (migrate_use_events()) { 1221 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1222 } 1223 } 1224 1225 static void migration_bitmap_sync_precopy(RAMState *rs) 1226 { 1227 Error *local_err = NULL; 1228 1229 /* 1230 * The current notifier usage is just an optimization to migration, so we 1231 * don't stop the normal migration process in the error case. 1232 */ 1233 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1234 error_report_err(local_err); 1235 local_err = NULL; 1236 } 1237 1238 migration_bitmap_sync(rs); 1239 1240 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1241 error_report_err(local_err); 1242 } 1243 } 1244 1245 void ram_release_page(const char *rbname, uint64_t offset) 1246 { 1247 if (!migrate_release_ram() || !migration_in_postcopy()) { 1248 return; 1249 } 1250 1251 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1252 } 1253 1254 /** 1255 * save_zero_page_to_file: send the zero page to the file 1256 * 1257 * Returns the size of data written to the file, 0 means the page is not 1258 * a zero page 1259 * 1260 * @pss: current PSS channel 1261 * @block: block that contains the page we want to send 1262 * @offset: offset inside the block for the page 1263 */ 1264 static int save_zero_page_to_file(PageSearchStatus *pss, 1265 RAMBlock *block, ram_addr_t offset) 1266 { 1267 uint8_t *p = block->host + offset; 1268 QEMUFile *file = pss->pss_channel; 1269 int len = 0; 1270 1271 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1272 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO); 1273 qemu_put_byte(file, 0); 1274 len += 1; 1275 ram_release_page(block->idstr, offset); 1276 } 1277 return len; 1278 } 1279 1280 /** 1281 * save_zero_page: send the zero page to the stream 1282 * 1283 * Returns the number of pages written. 
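 * (in practice: 1 when the page was all zeroes and was sent, or -1 when
 * it was not a zero page and nothing was written)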
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, block, offset);

    if (len) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_atomic_counters.normal, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_atomic_counters.normal, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_atomic_counters.normal, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, block, offset)) {
        return true;
    }

    save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * Copy it to an internal buffer to avoid it being modified by the VM,
     * so that we can catch any error during compression and
     * decompression.
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE.
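     * That is the be64 offset/flags word that save_page_header() always
     * emits; subtracting it leaves only the compressed payload in the
     * statistics.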
*/ 1466 compression_counters.compressed_size += bytes_xmit - 8; 1467 compression_counters.pages++; 1468 } 1469 1470 static bool save_page_use_compression(RAMState *rs); 1471 1472 static void flush_compressed_data(RAMState *rs) 1473 { 1474 MigrationState *ms = migrate_get_current(); 1475 int idx, len, thread_count; 1476 1477 if (!save_page_use_compression(rs)) { 1478 return; 1479 } 1480 thread_count = migrate_compress_threads(); 1481 1482 qemu_mutex_lock(&comp_done_lock); 1483 for (idx = 0; idx < thread_count; idx++) { 1484 while (!comp_param[idx].done) { 1485 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1486 } 1487 } 1488 qemu_mutex_unlock(&comp_done_lock); 1489 1490 for (idx = 0; idx < thread_count; idx++) { 1491 qemu_mutex_lock(&comp_param[idx].mutex); 1492 if (!comp_param[idx].quit) { 1493 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); 1494 /* 1495 * it's safe to fetch zero_page without holding comp_done_lock 1496 * as there is no further request submitted to the thread, 1497 * i.e, the thread should be waiting for a request at this point. 1498 */ 1499 update_compress_thread_counts(&comp_param[idx], len); 1500 } 1501 qemu_mutex_unlock(&comp_param[idx].mutex); 1502 } 1503 } 1504 1505 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1506 ram_addr_t offset) 1507 { 1508 param->block = block; 1509 param->offset = offset; 1510 } 1511 1512 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) 1513 { 1514 int idx, thread_count, bytes_xmit = -1, pages = -1; 1515 bool wait = migrate_compress_wait_thread(); 1516 MigrationState *ms = migrate_get_current(); 1517 1518 thread_count = migrate_compress_threads(); 1519 qemu_mutex_lock(&comp_done_lock); 1520 retry: 1521 for (idx = 0; idx < thread_count; idx++) { 1522 if (comp_param[idx].done) { 1523 comp_param[idx].done = false; 1524 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, 1525 comp_param[idx].file); 1526 qemu_mutex_lock(&comp_param[idx].mutex); 1527 set_compress_params(&comp_param[idx], block, offset); 1528 qemu_cond_signal(&comp_param[idx].cond); 1529 qemu_mutex_unlock(&comp_param[idx].mutex); 1530 pages = 1; 1531 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1532 break; 1533 } 1534 } 1535 1536 /* 1537 * wait for the free thread if the user specifies 'compress-wait-thread', 1538 * otherwise we will post the page out in the main thread as normal page. 1539 */ 1540 if (pages < 0 && wait) { 1541 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1542 goto retry; 1543 } 1544 qemu_mutex_unlock(&comp_done_lock); 1545 1546 return pages; 1547 } 1548 1549 /** 1550 * find_dirty_block: find the next dirty page and update any state 1551 * associated with the search process. 1552 * 1553 * Returns true if a page is found 1554 * 1555 * @rs: current RAM state 1556 * @pss: data about the state of the current dirty page scan 1557 * @again: set to false if the search has scanned the whole of RAM 1558 */ 1559 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1560 { 1561 /* Update pss->page for the next dirty bit in ramblock */ 1562 pss_find_next_dirty(pss); 1563 1564 if (pss->complete_round && pss->block == rs->last_seen_block && 1565 pss->page >= rs->last_page) { 1566 /* 1567 * We've been once around the RAM and haven't found anything. 1568 * Give up. 
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one on the destination.
             *
             * Also, if XBZRLE is on, stop using data compression at this
             * point.  In theory, XBZRLE can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
1634 */ 1635 assert(postcopy_has_request(rs)); 1636 1637 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1638 block = entry->rb; 1639 *offset = entry->offset; 1640 1641 if (entry->len > TARGET_PAGE_SIZE) { 1642 entry->len -= TARGET_PAGE_SIZE; 1643 entry->offset += TARGET_PAGE_SIZE; 1644 } else { 1645 memory_region_unref(block->mr); 1646 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1647 g_free(entry); 1648 migration_consume_urgent_request(); 1649 } 1650 1651 return block; 1652 } 1653 1654 #if defined(__linux__) 1655 /** 1656 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1657 * is found, return RAM block pointer and page offset 1658 * 1659 * Returns pointer to the RAMBlock containing faulting page, 1660 * NULL if no write faults are pending 1661 * 1662 * @rs: current RAM state 1663 * @offset: page offset from the beginning of the block 1664 */ 1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1666 { 1667 struct uffd_msg uffd_msg; 1668 void *page_address; 1669 RAMBlock *block; 1670 int res; 1671 1672 if (!migrate_background_snapshot()) { 1673 return NULL; 1674 } 1675 1676 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1677 if (res <= 0) { 1678 return NULL; 1679 } 1680 1681 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1682 block = qemu_ram_block_from_host(page_address, false, offset); 1683 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1684 return block; 1685 } 1686 1687 /** 1688 * ram_save_release_protection: release UFFD write protection after 1689 * a range of pages has been saved 1690 * 1691 * @rs: current RAM state 1692 * @pss: page-search-status structure 1693 * @start_page: index of the first page in the range relative to pss->block 1694 * 1695 * Returns 0 on success, negative value in case of an error 1696 */ 1697 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1698 unsigned long start_page) 1699 { 1700 int res = 0; 1701 1702 /* Check if page is from UFFD-managed region. */ 1703 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1704 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1705 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1706 1707 /* Flush async buffers before un-protect. */ 1708 qemu_fflush(pss->pss_channel); 1709 /* Un-protect memory range. 
*/ 1710 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1711 false, false); 1712 } 1713 1714 return res; 1715 } 1716 1717 /* ram_write_tracking_available: check if kernel supports required UFFD features 1718 * 1719 * Returns true if supports, false otherwise 1720 */ 1721 bool ram_write_tracking_available(void) 1722 { 1723 uint64_t uffd_features; 1724 int res; 1725 1726 res = uffd_query_features(&uffd_features); 1727 return (res == 0 && 1728 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1729 } 1730 1731 /* ram_write_tracking_compatible: check if guest configuration is 1732 * compatible with 'write-tracking' 1733 * 1734 * Returns true if compatible, false otherwise 1735 */ 1736 bool ram_write_tracking_compatible(void) 1737 { 1738 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1739 int uffd_fd; 1740 RAMBlock *block; 1741 bool ret = false; 1742 1743 /* Open UFFD file descriptor */ 1744 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1745 if (uffd_fd < 0) { 1746 return false; 1747 } 1748 1749 RCU_READ_LOCK_GUARD(); 1750 1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1752 uint64_t uffd_ioctls; 1753 1754 /* Nothing to do with read-only and MMIO-writable regions */ 1755 if (block->mr->readonly || block->mr->rom_device) { 1756 continue; 1757 } 1758 /* Try to register block memory via UFFD-IO to track writes */ 1759 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1760 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1761 goto out; 1762 } 1763 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1764 goto out; 1765 } 1766 } 1767 ret = true; 1768 1769 out: 1770 uffd_close_fd(uffd_fd); 1771 return ret; 1772 } 1773 1774 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1775 ram_addr_t size) 1776 { 1777 /* 1778 * We read one byte of each page; this will preallocate page tables if 1779 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1780 * where no page was populated yet. This might require adaption when 1781 * supporting other mappings, like shmem. 1782 */ 1783 for (; offset < size; offset += block->page_size) { 1784 char tmp = *((char *)block->host + offset); 1785 1786 /* Don't optimize the read out */ 1787 asm volatile("" : "+r" (tmp)); 1788 } 1789 } 1790 1791 static inline int populate_read_section(MemoryRegionSection *section, 1792 void *opaque) 1793 { 1794 const hwaddr size = int128_get64(section->size); 1795 hwaddr offset = section->offset_within_region; 1796 RAMBlock *block = section->mr->ram_block; 1797 1798 populate_read_range(block, offset, size); 1799 return 0; 1800 } 1801 1802 /* 1803 * ram_block_populate_read: preallocate page tables and populate pages in the 1804 * RAM block by reading a byte of each page. 1805 * 1806 * Since it's solely used for userfault_fd WP feature, here we just 1807 * hardcode page size to qemu_real_host_page_size. 1808 * 1809 * @block: RAM block to populate 1810 */ 1811 static void ram_block_populate_read(RAMBlock *rb) 1812 { 1813 /* 1814 * Skip populating all pages that fall into a discarded range as managed by 1815 * a RamDiscardManager responsible for the mapped memory region of the 1816 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1817 * must not get populated automatically. We don't have to track 1818 * modifications via userfaultfd WP reliably, because these pages will 1819 * not be part of the migration stream either way -- see 1820 * ramblock_dirty_bitmap_exclude_discarded_pages(). 
1821 * 1822 * Note: The result is only stable while migrating (precopy/postcopy). 1823 */ 1824 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1825 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1826 MemoryRegionSection section = { 1827 .mr = rb->mr, 1828 .offset_within_region = 0, 1829 .size = rb->mr->size, 1830 }; 1831 1832 ram_discard_manager_replay_populated(rdm, &section, 1833 populate_read_section, NULL); 1834 } else { 1835 populate_read_range(rb, 0, rb->used_length); 1836 } 1837 } 1838 1839 /* 1840 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1841 */ 1842 void ram_write_tracking_prepare(void) 1843 { 1844 RAMBlock *block; 1845 1846 RCU_READ_LOCK_GUARD(); 1847 1848 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1849 /* Nothing to do with read-only and MMIO-writable regions */ 1850 if (block->mr->readonly || block->mr->rom_device) { 1851 continue; 1852 } 1853 1854 /* 1855 * Populate pages of the RAM block before enabling userfault_fd 1856 * write protection. 1857 * 1858 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1859 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1860 * pages with pte_none() entries in page table. 1861 */ 1862 ram_block_populate_read(block); 1863 } 1864 } 1865 1866 /* 1867 * ram_write_tracking_start: start UFFD-WP memory tracking 1868 * 1869 * Returns 0 for success or negative value in case of error 1870 */ 1871 int ram_write_tracking_start(void) 1872 { 1873 int uffd_fd; 1874 RAMState *rs = ram_state; 1875 RAMBlock *block; 1876 1877 /* Open UFFD file descriptor */ 1878 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1879 if (uffd_fd < 0) { 1880 return uffd_fd; 1881 } 1882 rs->uffdio_fd = uffd_fd; 1883 1884 RCU_READ_LOCK_GUARD(); 1885 1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1887 /* Nothing to do with read-only and MMIO-writable regions */ 1888 if (block->mr->readonly || block->mr->rom_device) { 1889 continue; 1890 } 1891 1892 /* Register block memory with UFFD to track writes */ 1893 if (uffd_register_memory(rs->uffdio_fd, block->host, 1894 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1895 goto fail; 1896 } 1897 /* Apply UFFD write protection to the block memory range */ 1898 if (uffd_change_protection(rs->uffdio_fd, block->host, 1899 block->max_length, true, false)) { 1900 goto fail; 1901 } 1902 block->flags |= RAM_UF_WRITEPROTECT; 1903 memory_region_ref(block->mr); 1904 1905 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1906 block->host, block->max_length); 1907 } 1908 1909 return 0; 1910 1911 fail: 1912 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1913 1914 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1915 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1916 continue; 1917 } 1918 /* 1919 * In case some memory block failed to be write-protected 1920 * remove protection and unregister all succeeded RAM blocks 1921 */ 1922 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1923 false, false); 1924 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1925 /* Cleanup flags and remove reference */ 1926 block->flags &= ~RAM_UF_WRITEPROTECT; 1927 memory_region_unref(block->mr); 1928 } 1929 1930 uffd_close_fd(uffd_fd); 1931 rs->uffdio_fd = -1; 1932 return -1; 1933 } 1934 1935 /** 1936 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1937 */ 1938 void ram_write_tracking_stop(void) 1939 { 1940 RAMState *rs = ram_state; 1941 RAMBlock *block; 1942
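    /*
     * Tear-down mirrors ram_write_tracking_start(): walk every block flagged
     * RAM_UF_WRITEPROTECT under RCU, drop the write protection, unregister it
     * from UFFD, then clear the flag and the memory-region reference taken
     * when tracking started.
     */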
1943 RCU_READ_LOCK_GUARD(); 1944 1945 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1946 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1947 continue; 1948 } 1949 /* Remove protection and unregister all affected RAM blocks */ 1950 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1951 false, false); 1952 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1953 1954 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1955 block->host, block->max_length); 1956 1957 /* Cleanup flags and remove reference */ 1958 block->flags &= ~RAM_UF_WRITEPROTECT; 1959 memory_region_unref(block->mr); 1960 } 1961 1962 /* Finally close UFFD file descriptor */ 1963 uffd_close_fd(rs->uffdio_fd); 1964 rs->uffdio_fd = -1; 1965 } 1966 1967 #else 1968 /* No target OS support, stubs just fail or ignore */ 1969 1970 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1971 { 1972 (void) rs; 1973 (void) offset; 1974 1975 return NULL; 1976 } 1977 1978 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1979 unsigned long start_page) 1980 { 1981 (void) rs; 1982 (void) pss; 1983 (void) start_page; 1984 1985 return 0; 1986 } 1987 1988 bool ram_write_tracking_available(void) 1989 { 1990 return false; 1991 } 1992 1993 bool ram_write_tracking_compatible(void) 1994 { 1995 assert(0); 1996 return false; 1997 } 1998 1999 int ram_write_tracking_start(void) 2000 { 2001 assert(0); 2002 return -1; 2003 } 2004 2005 void ram_write_tracking_stop(void) 2006 { 2007 assert(0); 2008 } 2009 #endif /* defined(__linux__) */ 2010 2011 /** 2012 * get_queued_page: unqueue a page from the postcopy requests 2013 * 2014 * Skips pages that are already sent (!dirty) 2015 * 2016 * Returns true if a queued page is found 2017 * 2018 * @rs: current RAM state 2019 * @pss: data about the state of the current dirty page scan 2020 */ 2021 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2022 { 2023 RAMBlock *block; 2024 ram_addr_t offset; 2025 bool dirty; 2026 2027 do { 2028 block = unqueue_page(rs, &offset); 2029 /* 2030 * We're sending this page, and since it's postcopy nothing else 2031 * will dirty it, and we must make sure it doesn't get sent again 2032 * even if this queue request was received after the background 2033 * search already sent it. 2034 */ 2035 if (block) { 2036 unsigned long page; 2037 2038 page = offset >> TARGET_PAGE_BITS; 2039 dirty = test_bit(page, block->bmap); 2040 if (!dirty) { 2041 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2042 page); 2043 } else { 2044 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2045 } 2046 } 2047 2048 } while (block && !dirty); 2049 2050 if (!block) { 2051 /* 2052 * Poll write faults too if background snapshot is enabled; that's 2053 * when we have vcpus got blocked by the write protected pages. 2054 */ 2055 block = poll_fault_page(rs, &offset); 2056 } 2057 2058 if (block) { 2059 /* 2060 * We want the background search to continue from the queued page 2061 * since the guest is likely to want other pages near to the page 2062 * it just requested. 2063 */ 2064 pss->block = block; 2065 pss->page = offset >> TARGET_PAGE_BITS; 2066 2067 /* 2068 * This unqueued page would break the "one round" check, even is 2069 * really rare. 
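 * (that is: the queued page may sit behind the current scan position, so
 * pss->complete_round is reset below to keep find_dirty_block()'s
 * end-of-round detection honest)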
2070 */ 2071 pss->complete_round = false; 2072 } 2073 2074 return !!block; 2075 } 2076 2077 /** 2078 * migration_page_queue_free: drop any remaining pages in the ram 2079 * request queue 2080 * 2081 * It should be empty at the end anyway, but in error cases there may 2082 * be some left. in case that there is any page left, we drop it. 2083 * 2084 */ 2085 static void migration_page_queue_free(RAMState *rs) 2086 { 2087 struct RAMSrcPageRequest *mspr, *next_mspr; 2088 /* This queue generally should be empty - but in the case of a failed 2089 * migration might have some droppings in. 2090 */ 2091 RCU_READ_LOCK_GUARD(); 2092 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2093 memory_region_unref(mspr->rb->mr); 2094 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2095 g_free(mspr); 2096 } 2097 } 2098 2099 /** 2100 * ram_save_queue_pages: queue the page for transmission 2101 * 2102 * A request from postcopy destination for example. 2103 * 2104 * Returns zero on success or negative on error 2105 * 2106 * @rbname: Name of the RAMBLock of the request. NULL means the 2107 * same that last one. 2108 * @start: starting address from the start of the RAMBlock 2109 * @len: length (in bytes) to send 2110 */ 2111 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2112 { 2113 RAMBlock *ramblock; 2114 RAMState *rs = ram_state; 2115 2116 ram_counters.postcopy_requests++; 2117 RCU_READ_LOCK_GUARD(); 2118 2119 if (!rbname) { 2120 /* Reuse last RAMBlock */ 2121 ramblock = rs->last_req_rb; 2122 2123 if (!ramblock) { 2124 /* 2125 * Shouldn't happen, we can't reuse the last RAMBlock if 2126 * it's the 1st request. 2127 */ 2128 error_report("ram_save_queue_pages no previous block"); 2129 return -1; 2130 } 2131 } else { 2132 ramblock = qemu_ram_block_by_name(rbname); 2133 2134 if (!ramblock) { 2135 /* We shouldn't be asked for a non-existent RAMBlock */ 2136 error_report("ram_save_queue_pages no block '%s'", rbname); 2137 return -1; 2138 } 2139 rs->last_req_rb = ramblock; 2140 } 2141 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2142 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2143 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2144 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2145 __func__, start, len, ramblock->used_length); 2146 return -1; 2147 } 2148 2149 /* 2150 * When with postcopy preempt, we send back the page directly in the 2151 * rp-return thread. 2152 */ 2153 if (postcopy_preempt_active()) { 2154 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2155 size_t page_size = qemu_ram_pagesize(ramblock); 2156 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2157 int ret = 0; 2158 2159 qemu_mutex_lock(&rs->bitmap_mutex); 2160 2161 pss_init(pss, ramblock, page_start); 2162 /* 2163 * Always use the preempt channel, and make sure it's there. It's 2164 * safe to access without lock, because when rp-thread is running 2165 * we should be the only one who operates on the qemufile 2166 */ 2167 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2168 assert(pss->pss_channel); 2169 2170 /* 2171 * It must be either one or multiple of host page size. Just 2172 * assert; if something wrong we're mostly split brain anyway. 
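 *
 * Illustrative numbers (not from this code): with a 2MiB hugetlbfs block
 * and 4KiB target pages, page_size is 2MiB, so a request of len == 4MiB
 * passes the assert and drives the loop below through exactly two
 * ram_save_host_page_urgent() calls.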
2173 */ 2174 assert(len % page_size == 0); 2175 while (len) { 2176 if (ram_save_host_page_urgent(pss)) { 2177 error_report("%s: ram_save_host_page_urgent() failed: " 2178 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2179 __func__, ramblock->idstr, start); 2180 ret = -1; 2181 break; 2182 } 2183 /* 2184 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 2185 * will automatically be moved and point to the next host page 2186 * we're going to send, so no need to update here. 2187 * 2188 * Normally QEMU never sends >1 host page in requests, so 2189 * logically we don't even need that as the loop should only 2190 * run once, but just to be consistent. 2191 */ 2192 len -= page_size; 2193 }; 2194 qemu_mutex_unlock(&rs->bitmap_mutex); 2195 2196 return ret; 2197 } 2198 2199 struct RAMSrcPageRequest *new_entry = 2200 g_new0(struct RAMSrcPageRequest, 1); 2201 new_entry->rb = ramblock; 2202 new_entry->offset = start; 2203 new_entry->len = len; 2204 2205 memory_region_ref(ramblock->mr); 2206 qemu_mutex_lock(&rs->src_page_req_mutex); 2207 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2208 migration_make_urgent_request(); 2209 qemu_mutex_unlock(&rs->src_page_req_mutex); 2210 2211 return 0; 2212 } 2213 2214 static bool save_page_use_compression(RAMState *rs) 2215 { 2216 if (!migrate_use_compression()) { 2217 return false; 2218 } 2219 2220 /* 2221 * If xbzrle is enabled (e.g., after the first round of migration), stop 2222 * using the data compression. In theory, xbzrle can do better than 2223 * compression. 2224 */ 2225 if (rs->xbzrle_enabled) { 2226 return false; 2227 } 2228 2229 return true; 2230 } 2231 2232 /* 2233 * try to compress the page before posting it out, return true if the page 2234 * has been properly handled by compression, otherwise needs other 2235 * paths to handle it 2236 */ 2237 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2238 RAMBlock *block, ram_addr_t offset) 2239 { 2240 if (!save_page_use_compression(rs)) { 2241 return false; 2242 } 2243 2244 /* 2245 * When starting the process of a new block, the first page of 2246 * the block should be sent out before other pages in the same 2247 * block, and all the pages in the last block should have been sent 2248 * out. Keeping this order is important, because the 'cont' flag 2249 * is used to avoid resending the block name. 2250 * 2251 * We post the first page as a normal page because compression will take 2252 * much CPU resource.
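 *
 * (as I read it, flush_compressed_data() below also makes sure every
 * compressed page of the previous block is on the wire before the new
 * block's first page goes out, so the receiver never applies a 'cont'
 * page to the wrong block)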
2253 */ 2254 if (block != pss->last_sent_block) { 2255 flush_compressed_data(rs); 2256 return false; 2257 } 2258 2259 if (compress_page_with_multi_thread(block, offset) > 0) { 2260 return true; 2261 } 2262 2263 compression_counters.busy++; 2264 return false; 2265 } 2266 2267 /** 2268 * ram_save_target_page: save one target page 2269 * 2270 * Returns the number of pages written 2271 * 2272 * @rs: current RAM state 2273 * @pss: data about the page we want to send 2274 */ 2275 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2276 { 2277 RAMBlock *block = pss->block; 2278 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2279 int res; 2280 2281 if (control_save_page(pss, block, offset, &res)) { 2282 return res; 2283 } 2284 2285 if (save_compress_page(rs, pss, block, offset)) { 2286 return 1; 2287 } 2288 2289 res = save_zero_page(pss, block, offset); 2290 if (res > 0) { 2291 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2292 * page would be stale 2293 */ 2294 if (rs->xbzrle_enabled) { 2295 XBZRLE_cache_lock(); 2296 xbzrle_cache_zero_page(rs, block->offset + offset); 2297 XBZRLE_cache_unlock(); 2298 } 2299 return res; 2300 } 2301 2302 /* 2303 * Do not use multifd in postcopy as one whole host page should be 2304 * placed. Meanwhile postcopy requires atomic update of pages, so even 2305 * if host page size == guest page size the dest guest during run may 2306 * still see partially copied pages which is data corruption. 2307 */ 2308 if (migrate_use_multifd() && !migration_in_postcopy()) { 2309 return ram_save_multifd_page(pss->pss_channel, block, offset); 2310 } 2311 2312 return ram_save_page(rs, pss); 2313 } 2314 2315 /* Should be called before sending a host page */ 2316 static void pss_host_page_prepare(PageSearchStatus *pss) 2317 { 2318 /* How many guest pages are there in one host page? */ 2319 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2320 2321 pss->host_page_sending = true; 2322 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2323 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2324 } 2325 2326 /* 2327 * Whether the page pointed by PSS is within the host page being sent. 2328 * Must be called after a previous pss_host_page_prepare(). 2329 */ 2330 static bool pss_within_range(PageSearchStatus *pss) 2331 { 2332 ram_addr_t ram_addr; 2333 2334 assert(pss->host_page_sending); 2335 2336 /* Over host-page boundary? */ 2337 if (pss->page >= pss->host_page_end) { 2338 return false; 2339 } 2340 2341 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2342 2343 return offset_in_ramblock(pss->block, ram_addr); 2344 } 2345 2346 static void pss_host_page_finish(PageSearchStatus *pss) 2347 { 2348 pss->host_page_sending = false; 2349 /* This is not needed, but just to reset it */ 2350 pss->host_page_start = pss->host_page_end = 0; 2351 } 2352 2353 /* 2354 * Send an urgent host page specified by `pss'. Need to be called with 2355 * bitmap_mutex held. 2356 * 2357 * Returns 0 if save host page succeeded, false otherwise. 2358 */ 2359 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2360 { 2361 bool page_dirty, sent = false; 2362 RAMState *rs = ram_state; 2363 int ret = 0; 2364 2365 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2366 pss_host_page_prepare(pss); 2367 2368 /* 2369 * If precopy is sending the same page, let it be done in precopy, or 2370 * we could send the same page in two channels and none of them will 2371 * receive the whole page. 
2372 */ 2373 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2374 trace_postcopy_preempt_hit(pss->block->idstr, 2375 pss->page << TARGET_PAGE_BITS); 2376 return 0; 2377 } 2378 2379 do { 2380 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2381 2382 if (page_dirty) { 2383 /* Be strict to return code; it must be 1, or what else? */ 2384 if (ram_save_target_page(rs, pss) != 1) { 2385 error_report_once("%s: ram_save_target_page failed", __func__); 2386 ret = -1; 2387 goto out; 2388 } 2389 sent = true; 2390 } 2391 pss_find_next_dirty(pss); 2392 } while (pss_within_range(pss)); 2393 out: 2394 pss_host_page_finish(pss); 2395 /* For urgent requests, flush immediately if sent */ 2396 if (sent) { 2397 qemu_fflush(pss->pss_channel); 2398 } 2399 return ret; 2400 } 2401 2402 /** 2403 * ram_save_host_page: save a whole host page 2404 * 2405 * Starting at *offset send pages up to the end of the current host 2406 * page. It's valid for the initial offset to point into the middle of 2407 * a host page in which case the remainder of the hostpage is sent. 2408 * Only dirty target pages are sent. Note that the host page size may 2409 * be a huge page for this block. 2410 * 2411 * The saving stops at the boundary of the used_length of the block 2412 * if the RAMBlock isn't a multiple of the host page size. 2413 * 2414 * The caller must be with ram_state.bitmap_mutex held to call this 2415 * function. Note that this function can temporarily release the lock, but 2416 * when the function is returned it'll make sure the lock is still held. 2417 * 2418 * Returns the number of pages written or negative on error 2419 * 2420 * @rs: current RAM state 2421 * @pss: data about the page we want to send 2422 */ 2423 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2424 { 2425 bool page_dirty, preempt_active = postcopy_preempt_active(); 2426 int tmppages, pages = 0; 2427 size_t pagesize_bits = 2428 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2429 unsigned long start_page = pss->page; 2430 int res; 2431 2432 if (ramblock_is_ignored(pss->block)) { 2433 error_report("block %s should not be migrated !", pss->block->idstr); 2434 return 0; 2435 } 2436 2437 /* Update host page boundary information */ 2438 pss_host_page_prepare(pss); 2439 2440 do { 2441 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2442 2443 /* Check the pages is dirty and if it is send it */ 2444 if (page_dirty) { 2445 /* 2446 * Properly yield the lock only in postcopy preempt mode 2447 * because both migration thread and rp-return thread can 2448 * operate on the bitmaps. 2449 */ 2450 if (preempt_active) { 2451 qemu_mutex_unlock(&rs->bitmap_mutex); 2452 } 2453 tmppages = ram_save_target_page(rs, pss); 2454 if (tmppages >= 0) { 2455 pages += tmppages; 2456 /* 2457 * Allow rate limiting to happen in the middle of huge pages if 2458 * something is sent in the current iteration. 2459 */ 2460 if (pagesize_bits > 1 && tmppages > 0) { 2461 migration_rate_limit(); 2462 } 2463 } 2464 if (preempt_active) { 2465 qemu_mutex_lock(&rs->bitmap_mutex); 2466 } 2467 } else { 2468 tmppages = 0; 2469 } 2470 2471 if (tmppages < 0) { 2472 pss_host_page_finish(pss); 2473 return tmppages; 2474 } 2475 2476 pss_find_next_dirty(pss); 2477 } while (pss_within_range(pss)); 2478 2479 pss_host_page_finish(pss); 2480 2481 res = ram_save_release_protection(rs, pss, start_page); 2482 return (res < 0 ? 
res : pages); 2483 } 2484 2485 /** 2486 * ram_find_and_save_block: finds a dirty page and sends it to f 2487 * 2488 * Called within an RCU critical section. 2489 * 2490 * Returns the number of pages written where zero means no dirty pages, 2491 * or negative on error 2492 * 2493 * @rs: current RAM state 2494 * 2495 * On systems where host-page-size > target-page-size it will send all the 2496 * pages in a host page that are dirty. 2497 */ 2498 static int ram_find_and_save_block(RAMState *rs) 2499 { 2500 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2501 int pages = 0; 2502 bool again, found; 2503 2504 /* No dirty page as there is zero RAM */ 2505 if (!ram_bytes_total()) { 2506 return pages; 2507 } 2508 2509 /* 2510 * Always keep last_seen_block/last_page valid during this procedure, 2511 * because find_dirty_block() relies on these values (e.g., we compare 2512 * last_seen_block with pss.block to see whether we searched all the 2513 * ramblocks) to detect the completion of migration. Having NULL value 2514 * of last_seen_block can conditionally cause below loop to run forever. 2515 */ 2516 if (!rs->last_seen_block) { 2517 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2518 rs->last_page = 0; 2519 } 2520 2521 pss_init(pss, rs->last_seen_block, rs->last_page); 2522 2523 do { 2524 again = true; 2525 found = get_queued_page(rs, pss); 2526 2527 if (!found) { 2528 /* priority queue empty, so just search for something dirty */ 2529 found = find_dirty_block(rs, pss, &again); 2530 } 2531 2532 if (found) { 2533 pages = ram_save_host_page(rs, pss); 2534 } 2535 } while (!pages && again); 2536 2537 rs->last_seen_block = pss->block; 2538 rs->last_page = pss->page; 2539 2540 return pages; 2541 } 2542 2543 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2544 { 2545 uint64_t pages = size / TARGET_PAGE_SIZE; 2546 2547 if (zero) { 2548 stat64_add(&ram_atomic_counters.duplicate, pages); 2549 } else { 2550 stat64_add(&ram_atomic_counters.normal, pages); 2551 ram_transferred_add(size); 2552 qemu_file_credit_transfer(f, size); 2553 } 2554 } 2555 2556 static uint64_t ram_bytes_total_common(bool count_ignored) 2557 { 2558 RAMBlock *block; 2559 uint64_t total = 0; 2560 2561 RCU_READ_LOCK_GUARD(); 2562 2563 if (count_ignored) { 2564 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2565 total += block->used_length; 2566 } 2567 } else { 2568 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2569 total += block->used_length; 2570 } 2571 } 2572 return total; 2573 } 2574 2575 uint64_t ram_bytes_total(void) 2576 { 2577 return ram_bytes_total_common(false); 2578 } 2579 2580 static void xbzrle_load_setup(void) 2581 { 2582 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2583 } 2584 2585 static void xbzrle_load_cleanup(void) 2586 { 2587 g_free(XBZRLE.decoded_buf); 2588 XBZRLE.decoded_buf = NULL; 2589 } 2590 2591 static void ram_state_cleanup(RAMState **rsp) 2592 { 2593 if (*rsp) { 2594 migration_page_queue_free(*rsp); 2595 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2596 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2597 g_free(*rsp); 2598 *rsp = NULL; 2599 } 2600 } 2601 2602 static void xbzrle_cleanup(void) 2603 { 2604 XBZRLE_cache_lock(); 2605 if (XBZRLE.cache) { 2606 cache_fini(XBZRLE.cache); 2607 g_free(XBZRLE.encoded_buf); 2608 g_free(XBZRLE.current_buf); 2609 g_free(XBZRLE.zero_target_page); 2610 XBZRLE.cache = NULL; 2611 XBZRLE.encoded_buf = NULL; 2612 XBZRLE.current_buf = NULL; 2613 XBZRLE.zero_target_page = NULL; 2614 } 2615 XBZRLE_cache_unlock(); 2616 } 2617 2618 static void ram_save_cleanup(void 
*opaque) 2619 { 2620 RAMState **rsp = opaque; 2621 RAMBlock *block; 2622 2623 /* We don't use dirty log with background snapshots */ 2624 if (!migrate_background_snapshot()) { 2625 /* caller have hold iothread lock or is in a bh, so there is 2626 * no writing race against the migration bitmap 2627 */ 2628 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2629 /* 2630 * do not stop dirty log without starting it, since 2631 * memory_global_dirty_log_stop will assert that 2632 * memory_global_dirty_log_start/stop used in pairs 2633 */ 2634 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2635 } 2636 } 2637 2638 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2639 g_free(block->clear_bmap); 2640 block->clear_bmap = NULL; 2641 g_free(block->bmap); 2642 block->bmap = NULL; 2643 } 2644 2645 xbzrle_cleanup(); 2646 compress_threads_save_cleanup(); 2647 ram_state_cleanup(rsp); 2648 } 2649 2650 static void ram_state_reset(RAMState *rs) 2651 { 2652 int i; 2653 2654 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2655 rs->pss[i].last_sent_block = NULL; 2656 } 2657 2658 rs->last_seen_block = NULL; 2659 rs->last_page = 0; 2660 rs->last_version = ram_list.version; 2661 rs->xbzrle_enabled = false; 2662 } 2663 2664 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2665 2666 /* **** functions for postcopy ***** */ 2667 2668 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2669 { 2670 struct RAMBlock *block; 2671 2672 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2673 unsigned long *bitmap = block->bmap; 2674 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2675 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2676 2677 while (run_start < range) { 2678 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2679 ram_discard_range(block->idstr, 2680 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2681 ((ram_addr_t)(run_end - run_start)) 2682 << TARGET_PAGE_BITS); 2683 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2684 } 2685 } 2686 } 2687 2688 /** 2689 * postcopy_send_discard_bm_ram: discard a RAMBlock 2690 * 2691 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2692 * 2693 * @ms: current migration state 2694 * @block: RAMBlock to discard 2695 */ 2696 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2697 { 2698 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2699 unsigned long current; 2700 unsigned long *bitmap = block->bmap; 2701 2702 for (current = 0; current < end; ) { 2703 unsigned long one = find_next_bit(bitmap, end, current); 2704 unsigned long zero, discard_length; 2705 2706 if (one >= end) { 2707 break; 2708 } 2709 2710 zero = find_next_zero_bit(bitmap, end, one + 1); 2711 2712 if (zero >= end) { 2713 discard_length = end - one; 2714 } else { 2715 discard_length = zero - one; 2716 } 2717 postcopy_discard_send_range(ms, one, discard_length); 2718 current = one + discard_length; 2719 } 2720 } 2721 2722 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2723 2724 /** 2725 * postcopy_each_ram_send_discard: discard all RAMBlocks 2726 * 2727 * Utility for the outgoing postcopy code. 2728 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2729 * passing it bitmap indexes and name. 
2730 * (qemu_ram_foreach_block ends up passing unscaled lengths 2731 * which would mean postcopy code would have to deal with target page) 2732 * 2733 * @ms: current migration state 2734 */ 2735 static void postcopy_each_ram_send_discard(MigrationState *ms) 2736 { 2737 struct RAMBlock *block; 2738 2739 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2740 postcopy_discard_send_init(ms, block->idstr); 2741 2742 /* 2743 * Deal with TPS != HPS and huge pages. It discard any partially sent 2744 * host-page size chunks, mark any partially dirty host-page size 2745 * chunks as all dirty. In this case the host-page is the host-page 2746 * for the particular RAMBlock, i.e. it might be a huge page. 2747 */ 2748 postcopy_chunk_hostpages_pass(ms, block); 2749 2750 /* 2751 * Postcopy sends chunks of bitmap over the wire, but it 2752 * just needs indexes at this point, avoids it having 2753 * target page specific code. 2754 */ 2755 postcopy_send_discard_bm_ram(ms, block); 2756 postcopy_discard_send_finish(ms); 2757 } 2758 } 2759 2760 /** 2761 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2762 * 2763 * Helper for postcopy_chunk_hostpages; it's called twice to 2764 * canonicalize the two bitmaps, that are similar, but one is 2765 * inverted. 2766 * 2767 * Postcopy requires that all target pages in a hostpage are dirty or 2768 * clean, not a mix. This function canonicalizes the bitmaps. 2769 * 2770 * @ms: current migration state 2771 * @block: block that contains the page we want to canonicalize 2772 */ 2773 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2774 { 2775 RAMState *rs = ram_state; 2776 unsigned long *bitmap = block->bmap; 2777 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2778 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2779 unsigned long run_start; 2780 2781 if (block->page_size == TARGET_PAGE_SIZE) { 2782 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2783 return; 2784 } 2785 2786 /* Find a dirty page */ 2787 run_start = find_next_bit(bitmap, pages, 0); 2788 2789 while (run_start < pages) { 2790 2791 /* 2792 * If the start of this run of pages is in the middle of a host 2793 * page, then we need to fixup this host page. 2794 */ 2795 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2796 /* Find the end of this run */ 2797 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2798 /* 2799 * If the end isn't at the start of a host page, then the 2800 * run doesn't finish at the end of a host page 2801 * and we need to discard. 2802 */ 2803 } 2804 2805 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2806 unsigned long page; 2807 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2808 host_ratio); 2809 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2810 2811 /* Clean up the bitmap */ 2812 for (page = fixup_start_addr; 2813 page < fixup_start_addr + host_ratio; page++) { 2814 /* 2815 * Remark them as dirty, updating the count for any pages 2816 * that weren't previously dirty. 
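 *
 * Illustrative numbers: with 2MiB host pages and 4KiB target pages,
 * host_ratio is 512; a dirty run starting at target page 1000 is aligned
 * down to 512 and up to 1024, so target pages 512..1023 all end up dirty
 * and the host page is handled as a single unit.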
2817 */ 2818 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2819 } 2820 } 2821 2822 /* Find the next dirty page for the next iteration */ 2823 run_start = find_next_bit(bitmap, pages, run_start); 2824 } 2825 } 2826 2827 /** 2828 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2829 * 2830 * Transmit the set of pages to be discarded after precopy to the target 2831 * these are pages that: 2832 * a) Have been previously transmitted but are now dirty again 2833 * b) Pages that have never been transmitted, this ensures that 2834 * any pages on the destination that have been mapped by background 2835 * tasks get discarded (transparent huge pages is the specific concern) 2836 * Hopefully this is pretty sparse 2837 * 2838 * @ms: current migration state 2839 */ 2840 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2841 { 2842 RAMState *rs = ram_state; 2843 2844 RCU_READ_LOCK_GUARD(); 2845 2846 /* This should be our last sync, the src is now paused */ 2847 migration_bitmap_sync(rs); 2848 2849 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2850 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2851 rs->last_seen_block = NULL; 2852 rs->last_page = 0; 2853 2854 postcopy_each_ram_send_discard(ms); 2855 2856 trace_ram_postcopy_send_discard_bitmap(); 2857 } 2858 2859 /** 2860 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2861 * 2862 * Returns zero on success 2863 * 2864 * @rbname: name of the RAMBlock of the request. NULL means the 2865 * same that last one. 2866 * @start: RAMBlock starting page 2867 * @length: RAMBlock size 2868 */ 2869 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2870 { 2871 trace_ram_discard_range(rbname, start, length); 2872 2873 RCU_READ_LOCK_GUARD(); 2874 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2875 2876 if (!rb) { 2877 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2878 return -1; 2879 } 2880 2881 /* 2882 * On source VM, we don't need to update the received bitmap since 2883 * we don't even have one. 2884 */ 2885 if (rb->receivedmap) { 2886 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2887 length >> qemu_target_page_bits()); 2888 } 2889 2890 return ram_block_discard_range(rb, start, length); 2891 } 2892 2893 /* 2894 * For every allocation, we will try not to crash the VM if the 2895 * allocation failed. 
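 * (hence the g_try_malloc*() and cache_init() calls in xbzrle_init() below
 * return NULL on failure instead of aborting, and the function unwinds
 * through the free_* labels and reports -ENOMEM)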
2896 */ 2897 static int xbzrle_init(void) 2898 { 2899 Error *local_err = NULL; 2900 2901 if (!migrate_use_xbzrle()) { 2902 return 0; 2903 } 2904 2905 XBZRLE_cache_lock(); 2906 2907 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2908 if (!XBZRLE.zero_target_page) { 2909 error_report("%s: Error allocating zero page", __func__); 2910 goto err_out; 2911 } 2912 2913 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2914 TARGET_PAGE_SIZE, &local_err); 2915 if (!XBZRLE.cache) { 2916 error_report_err(local_err); 2917 goto free_zero_page; 2918 } 2919 2920 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2921 if (!XBZRLE.encoded_buf) { 2922 error_report("%s: Error allocating encoded_buf", __func__); 2923 goto free_cache; 2924 } 2925 2926 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2927 if (!XBZRLE.current_buf) { 2928 error_report("%s: Error allocating current_buf", __func__); 2929 goto free_encoded_buf; 2930 } 2931 2932 /* We are all good */ 2933 XBZRLE_cache_unlock(); 2934 return 0; 2935 2936 free_encoded_buf: 2937 g_free(XBZRLE.encoded_buf); 2938 XBZRLE.encoded_buf = NULL; 2939 free_cache: 2940 cache_fini(XBZRLE.cache); 2941 XBZRLE.cache = NULL; 2942 free_zero_page: 2943 g_free(XBZRLE.zero_target_page); 2944 XBZRLE.zero_target_page = NULL; 2945 err_out: 2946 XBZRLE_cache_unlock(); 2947 return -ENOMEM; 2948 } 2949 2950 static int ram_state_init(RAMState **rsp) 2951 { 2952 *rsp = g_try_new0(RAMState, 1); 2953 2954 if (!*rsp) { 2955 error_report("%s: Init ramstate fail", __func__); 2956 return -1; 2957 } 2958 2959 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2960 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2961 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2962 2963 /* 2964 * Count the total number of pages used by ram blocks not including any 2965 * gaps due to alignment or unplugs. 2966 * This must match with the initial values of dirty bitmap. 2967 */ 2968 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2969 ram_state_reset(*rsp); 2970 2971 return 0; 2972 } 2973 2974 static void ram_list_init_bitmaps(void) 2975 { 2976 MigrationState *ms = migrate_get_current(); 2977 RAMBlock *block; 2978 unsigned long pages; 2979 uint8_t shift; 2980 2981 /* Skip setting bitmap if there is no RAM */ 2982 if (ram_bytes_total()) { 2983 shift = ms->clear_bitmap_shift; 2984 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2985 error_report("clear_bitmap_shift (%u) too big, using " 2986 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2987 shift = CLEAR_BITMAP_SHIFT_MAX; 2988 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2989 error_report("clear_bitmap_shift (%u) too small, using " 2990 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2991 shift = CLEAR_BITMAP_SHIFT_MIN; 2992 } 2993 2994 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2995 pages = block->max_length >> TARGET_PAGE_BITS; 2996 /* 2997 * The initial dirty bitmap for migration must be set with all 2998 * ones to make sure we'll migrate every guest RAM page to 2999 * destination. 3000 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3001 * new migration after a failed migration, ram_list. 3002 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3003 * guest memory. 
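 * In other words, bitmap_set(block->bmap, 0, pages) below marks every page
 * up front, and ram_init_bitmaps() later calls
 * migration_bitmap_clear_discarded_pages() to drop pages that a
 * RamDiscardManager reports as unplugged.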
3004 */ 3005 block->bmap = bitmap_new(pages); 3006 bitmap_set(block->bmap, 0, pages); 3007 block->clear_bmap_shift = shift; 3008 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3009 } 3010 } 3011 } 3012 3013 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3014 { 3015 unsigned long pages; 3016 RAMBlock *rb; 3017 3018 RCU_READ_LOCK_GUARD(); 3019 3020 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3021 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3022 rs->migration_dirty_pages -= pages; 3023 } 3024 } 3025 3026 static void ram_init_bitmaps(RAMState *rs) 3027 { 3028 /* For memory_global_dirty_log_start below. */ 3029 qemu_mutex_lock_iothread(); 3030 qemu_mutex_lock_ramlist(); 3031 3032 WITH_RCU_READ_LOCK_GUARD() { 3033 ram_list_init_bitmaps(); 3034 /* We don't use dirty log with background snapshots */ 3035 if (!migrate_background_snapshot()) { 3036 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3037 migration_bitmap_sync_precopy(rs); 3038 } 3039 } 3040 qemu_mutex_unlock_ramlist(); 3041 qemu_mutex_unlock_iothread(); 3042 3043 /* 3044 * After an eventual first bitmap sync, fixup the initial bitmap 3045 * containing all 1s to exclude any discarded pages from migration. 3046 */ 3047 migration_bitmap_clear_discarded_pages(rs); 3048 } 3049 3050 static int ram_init_all(RAMState **rsp) 3051 { 3052 if (ram_state_init(rsp)) { 3053 return -1; 3054 } 3055 3056 if (xbzrle_init()) { 3057 ram_state_cleanup(rsp); 3058 return -1; 3059 } 3060 3061 ram_init_bitmaps(*rsp); 3062 3063 return 0; 3064 } 3065 3066 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3067 { 3068 RAMBlock *block; 3069 uint64_t pages = 0; 3070 3071 /* 3072 * Postcopy is not using xbzrle/compression, so no need for that. 3073 * Also, since source are already halted, we don't need to care 3074 * about dirty page logging as well. 3075 */ 3076 3077 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3078 pages += bitmap_count_one(block->bmap, 3079 block->used_length >> TARGET_PAGE_BITS); 3080 } 3081 3082 /* This may not be aligned with current bitmaps. Recalculate. */ 3083 rs->migration_dirty_pages = pages; 3084 3085 ram_state_reset(rs); 3086 3087 /* Update RAMState cache of output QEMUFile */ 3088 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3089 3090 trace_ram_state_resume_prepare(pages); 3091 } 3092 3093 /* 3094 * This function clears bits of the free pages reported by the caller from the 3095 * migration dirty bitmap. @addr is the host address corresponding to the 3096 * start of the continuous guest free pages, and @len is the total bytes of 3097 * those pages. 3098 */ 3099 void qemu_guest_free_page_hint(void *addr, size_t len) 3100 { 3101 RAMBlock *block; 3102 ram_addr_t offset; 3103 size_t used_len, start, npages; 3104 MigrationState *s = migrate_get_current(); 3105 3106 /* This function is currently expected to be used during live migration */ 3107 if (!migration_is_setup_or_active(s->state)) { 3108 return; 3109 } 3110 3111 for (; len > 0; len -= used_len, addr += used_len) { 3112 block = qemu_ram_block_from_host(addr, false, &offset); 3113 if (unlikely(!block || offset >= block->used_length)) { 3114 /* 3115 * The implementation might not support RAMBlock resize during 3116 * live migration, but it could happen in theory with future 3117 * updates. So we add a check here to capture that case. 
3118 */ 3119 error_report_once("%s unexpected error", __func__); 3120 return; 3121 } 3122 3123 if (len <= block->used_length - offset) { 3124 used_len = len; 3125 } else { 3126 used_len = block->used_length - offset; 3127 } 3128 3129 start = offset >> TARGET_PAGE_BITS; 3130 npages = used_len >> TARGET_PAGE_BITS; 3131 3132 qemu_mutex_lock(&ram_state->bitmap_mutex); 3133 /* 3134 * The skipped free pages are equavalent to be sent from clear_bmap's 3135 * perspective, so clear the bits from the memory region bitmap which 3136 * are initially set. Otherwise those skipped pages will be sent in 3137 * the next round after syncing from the memory region bitmap. 3138 */ 3139 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3140 ram_state->migration_dirty_pages -= 3141 bitmap_count_one_with_offset(block->bmap, start, npages); 3142 bitmap_clear(block->bmap, start, npages); 3143 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3144 } 3145 } 3146 3147 /* 3148 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3149 * long-running RCU critical section. When rcu-reclaims in the code 3150 * start to become numerous it will be necessary to reduce the 3151 * granularity of these critical sections. 3152 */ 3153 3154 /** 3155 * ram_save_setup: Setup RAM for migration 3156 * 3157 * Returns zero to indicate success and negative for error 3158 * 3159 * @f: QEMUFile where to send the data 3160 * @opaque: RAMState pointer 3161 */ 3162 static int ram_save_setup(QEMUFile *f, void *opaque) 3163 { 3164 RAMState **rsp = opaque; 3165 RAMBlock *block; 3166 int ret; 3167 3168 if (compress_threads_save_setup()) { 3169 return -1; 3170 } 3171 3172 /* migration has already setup the bitmap, reuse it. */ 3173 if (!migration_in_colo_state()) { 3174 if (ram_init_all(rsp) != 0) { 3175 compress_threads_save_cleanup(); 3176 return -1; 3177 } 3178 } 3179 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3180 3181 WITH_RCU_READ_LOCK_GUARD() { 3182 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3183 3184 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3185 qemu_put_byte(f, strlen(block->idstr)); 3186 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3187 qemu_put_be64(f, block->used_length); 3188 if (migrate_postcopy_ram() && block->page_size != 3189 qemu_host_page_size) { 3190 qemu_put_be64(f, block->page_size); 3191 } 3192 if (migrate_ignore_shared()) { 3193 qemu_put_be64(f, block->mr->addr); 3194 } 3195 } 3196 } 3197 3198 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3199 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3200 3201 ret = multifd_send_sync_main(f); 3202 if (ret < 0) { 3203 return ret; 3204 } 3205 3206 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3207 qemu_fflush(f); 3208 3209 return 0; 3210 } 3211 3212 /** 3213 * ram_save_iterate: iterative stage for migration 3214 * 3215 * Returns zero to indicate success and negative for error 3216 * 3217 * @f: QEMUFile where to send the data 3218 * @opaque: RAMState pointer 3219 */ 3220 static int ram_save_iterate(QEMUFile *f, void *opaque) 3221 { 3222 RAMState **temp = opaque; 3223 RAMState *rs = *temp; 3224 int ret = 0; 3225 int i; 3226 int64_t t0; 3227 int done = 0; 3228 3229 if (blk_mig_bulk_active()) { 3230 /* Avoid transferring ram during bulk phase of block migration as 3231 * the bulk phase will usually take a long time and transferring 3232 * ram updates during that time is pointless. */ 3233 goto out; 3234 } 3235 3236 /* 3237 * We'll take this lock a little bit long, but it's okay for two reasons. 
3238 * Firstly, the only other thread that may take it is the one calling 3239 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3240 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3241 * guarantees that we'll at least release it on a regular basis. 3242 */ 3243 qemu_mutex_lock(&rs->bitmap_mutex); 3244 WITH_RCU_READ_LOCK_GUARD() { 3245 if (ram_list.version != rs->last_version) { 3246 ram_state_reset(rs); 3247 } 3248 3249 /* Read version before ram_list.blocks */ 3250 smp_rmb(); 3251 3252 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3253 3254 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3255 i = 0; 3256 while ((ret = qemu_file_rate_limit(f)) == 0 || 3257 postcopy_has_request(rs)) { 3258 int pages; 3259 3260 if (qemu_file_get_error(f)) { 3261 break; 3262 } 3263 3264 pages = ram_find_and_save_block(rs); 3265 /* no more pages to send */ 3266 if (pages == 0) { 3267 done = 1; 3268 break; 3269 } 3270 3271 if (pages < 0) { 3272 qemu_file_set_error(f, pages); 3273 break; 3274 } 3275 3276 rs->target_page_count += pages; 3277 3278 /* 3279 * During postcopy, it is necessary to make sure one whole host 3280 * page is sent in one chunk. 3281 */ 3282 if (migrate_postcopy_ram()) { 3283 flush_compressed_data(rs); 3284 } 3285 3286 /* 3287 * we want to check in the 1st loop, just in case it was the 1st 3288 * time and we had to sync the dirty bitmap. 3289 * qemu_clock_get_ns() is a bit expensive, so we only check every 3290 * few iterations 3291 */ 3292 if ((i & 63) == 0) { 3293 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3294 1000000; 3295 if (t1 > MAX_WAIT) { 3296 trace_ram_save_iterate_big_wait(t1, i); 3297 break; 3298 } 3299 } 3300 i++; 3301 } 3302 } 3303 qemu_mutex_unlock(&rs->bitmap_mutex); 3304 3305 /* 3306 * Must occur before EOS (or any QEMUFile operation) 3307 * because of RDMA protocol.
3308 */ 3309 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3310 3311 out: 3312 if (ret >= 0 3313 && migration_is_setup_or_active(migrate_get_current()->state)) { 3314 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3315 if (ret < 0) { 3316 return ret; 3317 } 3318 3319 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3320 qemu_fflush(f); 3321 ram_transferred_add(8); 3322 3323 ret = qemu_file_get_error(f); 3324 } 3325 if (ret < 0) { 3326 return ret; 3327 } 3328 3329 return done; 3330 } 3331 3332 /** 3333 * ram_save_complete: function called to send the remaining amount of ram 3334 * 3335 * Returns zero to indicate success or negative on error 3336 * 3337 * Called with iothread lock 3338 * 3339 * @f: QEMUFile where to send the data 3340 * @opaque: RAMState pointer 3341 */ 3342 static int ram_save_complete(QEMUFile *f, void *opaque) 3343 { 3344 RAMState **temp = opaque; 3345 RAMState *rs = *temp; 3346 int ret = 0; 3347 3348 rs->last_stage = !migration_in_colo_state(); 3349 3350 WITH_RCU_READ_LOCK_GUARD() { 3351 if (!migration_in_postcopy()) { 3352 migration_bitmap_sync_precopy(rs); 3353 } 3354 3355 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3356 3357 /* try transferring iterative blocks of memory */ 3358 3359 /* flush all remaining blocks regardless of rate limiting */ 3360 qemu_mutex_lock(&rs->bitmap_mutex); 3361 while (true) { 3362 int pages; 3363 3364 pages = ram_find_and_save_block(rs); 3365 /* no more blocks to sent */ 3366 if (pages == 0) { 3367 break; 3368 } 3369 if (pages < 0) { 3370 ret = pages; 3371 break; 3372 } 3373 } 3374 qemu_mutex_unlock(&rs->bitmap_mutex); 3375 3376 flush_compressed_data(rs); 3377 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3378 } 3379 3380 if (ret < 0) { 3381 return ret; 3382 } 3383 3384 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3385 if (ret < 0) { 3386 return ret; 3387 } 3388 3389 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3390 qemu_fflush(f); 3391 3392 return 0; 3393 } 3394 3395 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3396 uint64_t *res_precopy_only, 3397 uint64_t *res_compatible, 3398 uint64_t *res_postcopy_only) 3399 { 3400 RAMState **temp = opaque; 3401 RAMState *rs = *temp; 3402 uint64_t remaining_size; 3403 3404 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3405 3406 if (!migration_in_postcopy() && 3407 remaining_size < max_size) { 3408 qemu_mutex_lock_iothread(); 3409 WITH_RCU_READ_LOCK_GUARD() { 3410 migration_bitmap_sync_precopy(rs); 3411 } 3412 qemu_mutex_unlock_iothread(); 3413 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3414 } 3415 3416 if (migrate_postcopy_ram()) { 3417 /* We can do postcopy, and all the data is postcopiable */ 3418 *res_compatible += remaining_size; 3419 } else { 3420 *res_precopy_only += remaining_size; 3421 } 3422 } 3423 3424 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3425 { 3426 unsigned int xh_len; 3427 int xh_flags; 3428 uint8_t *loaded_data; 3429 3430 /* extract RLE header */ 3431 xh_flags = qemu_get_byte(f); 3432 xh_len = qemu_get_be16(f); 3433 3434 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3435 error_report("Failed to load XBZRLE page - wrong compression!"); 3436 return -1; 3437 } 3438 3439 if (xh_len > TARGET_PAGE_SIZE) { 3440 error_report("Failed to load XBZRLE page - len overflow!"); 3441 return -1; 3442 } 3443 loaded_data = XBZRLE.decoded_buf; 3444 /* load data and decode */ 3445 /* it can change loaded_data to point to an internal buffer */ 3446 qemu_get_buffer_in_place(f, 
&loaded_data, xh_len); 3447 3448 /* decode RLE */ 3449 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3450 TARGET_PAGE_SIZE) == -1) { 3451 error_report("Failed to load XBZRLE page - decode error!"); 3452 return -1; 3453 } 3454 3455 return 0; 3456 } 3457 3458 /** 3459 * ram_block_from_stream: read a RAMBlock id from the migration stream 3460 * 3461 * Must be called from within a rcu critical section. 3462 * 3463 * Returns a pointer from within the RCU-protected ram_list. 3464 * 3465 * @mis: the migration incoming state pointer 3466 * @f: QEMUFile where to read the data from 3467 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3468 * @channel: the channel we're using 3469 */ 3470 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3471 QEMUFile *f, int flags, 3472 int channel) 3473 { 3474 RAMBlock *block = mis->last_recv_block[channel]; 3475 char id[256]; 3476 uint8_t len; 3477 3478 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3479 if (!block) { 3480 error_report("Ack, bad migration stream!"); 3481 return NULL; 3482 } 3483 return block; 3484 } 3485 3486 len = qemu_get_byte(f); 3487 qemu_get_buffer(f, (uint8_t *)id, len); 3488 id[len] = 0; 3489 3490 block = qemu_ram_block_by_name(id); 3491 if (!block) { 3492 error_report("Can't find block %s", id); 3493 return NULL; 3494 } 3495 3496 if (ramblock_is_ignored(block)) { 3497 error_report("block %s should not be migrated !", id); 3498 return NULL; 3499 } 3500 3501 mis->last_recv_block[channel] = block; 3502 3503 return block; 3504 } 3505 3506 static inline void *host_from_ram_block_offset(RAMBlock *block, 3507 ram_addr_t offset) 3508 { 3509 if (!offset_in_ramblock(block, offset)) { 3510 return NULL; 3511 } 3512 3513 return block->host + offset; 3514 } 3515 3516 static void *host_page_from_ram_block_offset(RAMBlock *block, 3517 ram_addr_t offset) 3518 { 3519 /* Note: Explicitly no check against offset_in_ramblock(). */ 3520 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3521 block->page_size); 3522 } 3523 3524 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3525 ram_addr_t offset) 3526 { 3527 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3528 } 3529 3530 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3531 ram_addr_t offset, bool record_bitmap) 3532 { 3533 if (!offset_in_ramblock(block, offset)) { 3534 return NULL; 3535 } 3536 if (!block->colo_cache) { 3537 error_report("%s: colo_cache is NULL in block :%s", 3538 __func__, block->idstr); 3539 return NULL; 3540 } 3541 3542 /* 3543 * During colo checkpoint, we need bitmap of these migrated pages. 3544 * It help us to decide which pages in ram cache should be flushed 3545 * into VM's RAM later. 3546 */ 3547 if (record_bitmap && 3548 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3549 ram_state->migration_dirty_pages++; 3550 } 3551 return block->colo_cache + offset; 3552 } 3553 3554 /** 3555 * ram_handle_compressed: handle the zero page case 3556 * 3557 * If a page (or a whole RDMA chunk) has been 3558 * determined to be zero, then zap it. 3559 * 3560 * @host: host address for the zero page 3561 * @ch: what the page is filled from. 
We only support zero 3562 * @size: size of the zero page 3563 */ 3564 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3565 { 3566 if (ch != 0 || !buffer_is_zero(host, size)) { 3567 memset(host, ch, size); 3568 } 3569 } 3570 3571 /* return the size after decompression, or negative value on error */ 3572 static int 3573 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3574 const uint8_t *source, size_t source_len) 3575 { 3576 int err; 3577 3578 err = inflateReset(stream); 3579 if (err != Z_OK) { 3580 return -1; 3581 } 3582 3583 stream->avail_in = source_len; 3584 stream->next_in = (uint8_t *)source; 3585 stream->avail_out = dest_len; 3586 stream->next_out = dest; 3587 3588 err = inflate(stream, Z_NO_FLUSH); 3589 if (err != Z_STREAM_END) { 3590 return -1; 3591 } 3592 3593 return stream->total_out; 3594 } 3595 3596 static void *do_data_decompress(void *opaque) 3597 { 3598 DecompressParam *param = opaque; 3599 unsigned long pagesize; 3600 uint8_t *des; 3601 int len, ret; 3602 3603 qemu_mutex_lock(&param->mutex); 3604 while (!param->quit) { 3605 if (param->des) { 3606 des = param->des; 3607 len = param->len; 3608 param->des = 0; 3609 qemu_mutex_unlock(&param->mutex); 3610 3611 pagesize = TARGET_PAGE_SIZE; 3612 3613 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3614 param->compbuf, len); 3615 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3616 error_report("decompress data failed"); 3617 qemu_file_set_error(decomp_file, ret); 3618 } 3619 3620 qemu_mutex_lock(&decomp_done_lock); 3621 param->done = true; 3622 qemu_cond_signal(&decomp_done_cond); 3623 qemu_mutex_unlock(&decomp_done_lock); 3624 3625 qemu_mutex_lock(&param->mutex); 3626 } else { 3627 qemu_cond_wait(&param->cond, &param->mutex); 3628 } 3629 } 3630 qemu_mutex_unlock(&param->mutex); 3631 3632 return NULL; 3633 } 3634 3635 static int wait_for_decompress_done(void) 3636 { 3637 int idx, thread_count; 3638 3639 if (!migrate_use_compression()) { 3640 return 0; 3641 } 3642 3643 thread_count = migrate_decompress_threads(); 3644 qemu_mutex_lock(&decomp_done_lock); 3645 for (idx = 0; idx < thread_count; idx++) { 3646 while (!decomp_param[idx].done) { 3647 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3648 } 3649 } 3650 qemu_mutex_unlock(&decomp_done_lock); 3651 return qemu_file_get_error(decomp_file); 3652 } 3653 3654 static void compress_threads_load_cleanup(void) 3655 { 3656 int i, thread_count; 3657 3658 if (!migrate_use_compression()) { 3659 return; 3660 } 3661 thread_count = migrate_decompress_threads(); 3662 for (i = 0; i < thread_count; i++) { 3663 /* 3664 * we use it as an indicator which shows if the thread is 3665 * properly init'd or not 3666 */ 3667 if (!decomp_param[i].compbuf) { 3668 break; 3669 } 3670 3671 qemu_mutex_lock(&decomp_param[i].mutex); 3672 decomp_param[i].quit = true; 3673 qemu_cond_signal(&decomp_param[i].cond); 3674 qemu_mutex_unlock(&decomp_param[i].mutex); 3675 } 3676 for (i = 0; i < thread_count; i++) { 3677 if (!decomp_param[i].compbuf) { 3678 break; 3679 } 3680 3681 qemu_thread_join(decompress_threads + i); 3682 qemu_mutex_destroy(&decomp_param[i].mutex); 3683 qemu_cond_destroy(&decomp_param[i].cond); 3684 inflateEnd(&decomp_param[i].stream); 3685 g_free(decomp_param[i].compbuf); 3686 decomp_param[i].compbuf = NULL; 3687 } 3688 g_free(decompress_threads); 3689 g_free(decomp_param); 3690 decompress_threads = NULL; 3691 decomp_param = NULL; 3692 decomp_file = NULL; 3693 } 3694 3695 static int compress_threads_load_setup(QEMUFile *f) 3696 { 3697 int i, thread_count;
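    /*
     * One decompress thread per migrate_decompress_threads(); each thread
     * gets a compressBound(TARGET_PAGE_SIZE) scratch buffer and starts out
     * idle (done = true), waiting on its condition variable.
     */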
3698 3699 if (!migrate_use_compression()) { 3700 return 0; 3701 } 3702 3703 thread_count = migrate_decompress_threads(); 3704 decompress_threads = g_new0(QemuThread, thread_count); 3705 decomp_param = g_new0(DecompressParam, thread_count); 3706 qemu_mutex_init(&decomp_done_lock); 3707 qemu_cond_init(&decomp_done_cond); 3708 decomp_file = f; 3709 for (i = 0; i < thread_count; i++) { 3710 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3711 goto exit; 3712 } 3713 3714 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3715 qemu_mutex_init(&decomp_param[i].mutex); 3716 qemu_cond_init(&decomp_param[i].cond); 3717 decomp_param[i].done = true; 3718 decomp_param[i].quit = false; 3719 qemu_thread_create(decompress_threads + i, "decompress", 3720 do_data_decompress, decomp_param + i, 3721 QEMU_THREAD_JOINABLE); 3722 } 3723 return 0; 3724 exit: 3725 compress_threads_load_cleanup(); 3726 return -1; 3727 } 3728 3729 static void decompress_data_with_multi_threads(QEMUFile *f, 3730 void *host, int len) 3731 { 3732 int idx, thread_count; 3733 3734 thread_count = migrate_decompress_threads(); 3735 QEMU_LOCK_GUARD(&decomp_done_lock); 3736 while (true) { 3737 for (idx = 0; idx < thread_count; idx++) { 3738 if (decomp_param[idx].done) { 3739 decomp_param[idx].done = false; 3740 qemu_mutex_lock(&decomp_param[idx].mutex); 3741 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3742 decomp_param[idx].des = host; 3743 decomp_param[idx].len = len; 3744 qemu_cond_signal(&decomp_param[idx].cond); 3745 qemu_mutex_unlock(&decomp_param[idx].mutex); 3746 break; 3747 } 3748 } 3749 if (idx < thread_count) { 3750 break; 3751 } else { 3752 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3753 } 3754 } 3755 } 3756 3757 static void colo_init_ram_state(void) 3758 { 3759 ram_state_init(&ram_state); 3760 } 3761 3762 /* 3763 * colo cache: this is for secondary VM, we cache the whole 3764 * memory of the secondary VM, it is need to hold the global lock 3765 * to call this helper. 3766 */ 3767 int colo_init_ram_cache(void) 3768 { 3769 RAMBlock *block; 3770 3771 WITH_RCU_READ_LOCK_GUARD() { 3772 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3773 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3774 NULL, false, false); 3775 if (!block->colo_cache) { 3776 error_report("%s: Can't alloc memory for COLO cache of block %s," 3777 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3778 block->used_length); 3779 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3780 if (block->colo_cache) { 3781 qemu_anon_ram_free(block->colo_cache, block->used_length); 3782 block->colo_cache = NULL; 3783 } 3784 } 3785 return -errno; 3786 } 3787 if (!machine_dump_guest_core(current_machine)) { 3788 qemu_madvise(block->colo_cache, block->used_length, 3789 QEMU_MADV_DONTDUMP); 3790 } 3791 } 3792 } 3793 3794 /* 3795 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3796 * with to decide which page in cache should be flushed into SVM's RAM. Here 3797 * we use the same name 'ram_bitmap' as for migration. 3798 */ 3799 if (ram_bytes_total()) { 3800 RAMBlock *block; 3801 3802 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3803 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3804 block->bmap = bitmap_new(pages); 3805 } 3806 } 3807 3808 colo_init_ram_state(); 3809 return 0; 3810 } 3811 3812 /* TODO: duplicated with ram_init_bitmaps */ 3813 void colo_incoming_start_dirty_log(void) 3814 { 3815 RAMBlock *block = NULL; 3816 /* For memory_global_dirty_log_start below. 
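 * Both the iothread and ramlist locks are taken here, mirroring
 * ram_init_bitmaps(); the per-block bitmaps are synced and then zeroed
 * under RCU before dirty logging starts.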
/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock must be held when calling this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
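/*
 * Worked example for the host-page assembly done in ram_load_postcopy()
 * below (an illustration only, assuming 4 KiB target pages): a RAMBlock
 * backed by 2 MiB hugetlbfs pages has block->page_size / TARGET_PAGE_SIZE
 * = 512 target pages per host page, so 512 stream chunks are collected
 * into tmp_page->tmp_huge_page before the single atomic placement.
 */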
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If there is a qemu file error, we should stop here; "addr"
         * may be invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages;
             * however, the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TPs are zero then we can optimise the placement */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page.
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip setting page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
             */
4020 */ 4021 if (ch || !matches_target_page_size) { 4022 memset(page_buffer, ch, TARGET_PAGE_SIZE); 4023 } 4024 if (ch) { 4025 tmp_page->all_zero = false; 4026 } 4027 break; 4028 4029 case RAM_SAVE_FLAG_PAGE: 4030 tmp_page->all_zero = false; 4031 if (!matches_target_page_size) { 4032 /* For huge pages, we always use temporary buffer */ 4033 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 4034 } else { 4035 /* 4036 * For small pages that matches target page size, we 4037 * avoid the qemu_file copy. Instead we directly use 4038 * the buffer of QEMUFile to place the page. Note: we 4039 * cannot do any QEMUFile operation before using that 4040 * buffer to make sure the buffer is valid when 4041 * placing the page. 4042 */ 4043 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 4044 TARGET_PAGE_SIZE); 4045 } 4046 break; 4047 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4048 tmp_page->all_zero = false; 4049 len = qemu_get_be32(f); 4050 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4051 error_report("Invalid compressed data length: %d", len); 4052 ret = -EINVAL; 4053 break; 4054 } 4055 decompress_data_with_multi_threads(f, page_buffer, len); 4056 break; 4057 4058 case RAM_SAVE_FLAG_EOS: 4059 /* normal exit */ 4060 multifd_recv_sync_main(); 4061 break; 4062 default: 4063 error_report("Unknown combination of migration flags: 0x%x" 4064 " (postcopy mode)", flags); 4065 ret = -EINVAL; 4066 break; 4067 } 4068 4069 /* Got the whole host page, wait for decompress before placing. */ 4070 if (place_needed) { 4071 ret |= wait_for_decompress_done(); 4072 } 4073 4074 /* Detect for any possible file errors */ 4075 if (!ret && qemu_file_get_error(f)) { 4076 ret = qemu_file_get_error(f); 4077 } 4078 4079 if (!ret && place_needed) { 4080 if (tmp_page->all_zero) { 4081 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 4082 } else { 4083 ret = postcopy_place_page(mis, tmp_page->host_addr, 4084 place_source, block); 4085 } 4086 place_needed = false; 4087 postcopy_temp_page_reset(tmp_page); 4088 } 4089 } 4090 4091 return ret; 4092 } 4093 4094 static bool postcopy_is_advised(void) 4095 { 4096 PostcopyState ps = postcopy_state_get(); 4097 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 4098 } 4099 4100 static bool postcopy_is_running(void) 4101 { 4102 PostcopyState ps = postcopy_state_get(); 4103 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 4104 } 4105 4106 /* 4107 * Flush content of RAM cache into SVM's memory. 4108 * Only flush the pages that be dirtied by PVM or SVM or both. 
4109 */ 4110 void colo_flush_ram_cache(void) 4111 { 4112 RAMBlock *block = NULL; 4113 void *dst_host; 4114 void *src_host; 4115 unsigned long offset = 0; 4116 4117 memory_global_dirty_log_sync(); 4118 WITH_RCU_READ_LOCK_GUARD() { 4119 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4120 ramblock_sync_dirty_bitmap(ram_state, block); 4121 } 4122 } 4123 4124 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 4125 WITH_RCU_READ_LOCK_GUARD() { 4126 block = QLIST_FIRST_RCU(&ram_list.blocks); 4127 4128 while (block) { 4129 unsigned long num = 0; 4130 4131 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 4132 if (!offset_in_ramblock(block, 4133 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 4134 offset = 0; 4135 num = 0; 4136 block = QLIST_NEXT_RCU(block, next); 4137 } else { 4138 unsigned long i = 0; 4139 4140 for (i = 0; i < num; i++) { 4141 migration_bitmap_clear_dirty(ram_state, block, offset + i); 4142 } 4143 dst_host = block->host 4144 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4145 src_host = block->colo_cache 4146 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 4147 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 4148 offset += num; 4149 } 4150 } 4151 } 4152 trace_colo_flush_ram_cache_end(); 4153 } 4154 4155 /** 4156 * ram_load_precopy: load pages in precopy case 4157 * 4158 * Returns 0 for success or -errno in case of error 4159 * 4160 * Called in precopy mode by ram_load(). 4161 * rcu_read_lock is taken prior to this being called. 4162 * 4163 * @f: QEMUFile where to send the data 4164 */ 4165 static int ram_load_precopy(QEMUFile *f) 4166 { 4167 MigrationIncomingState *mis = migration_incoming_get_current(); 4168 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 4169 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4170 bool postcopy_advised = postcopy_is_advised(); 4171 if (!migrate_use_compression()) { 4172 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 4173 } 4174 4175 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4176 ram_addr_t addr, total_ram_bytes; 4177 void *host = NULL, *host_bak = NULL; 4178 uint8_t ch; 4179 4180 /* 4181 * Yield periodically to let main loop run, but an iteration of 4182 * the main loop is expensive, so do it each some iterations 4183 */ 4184 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4185 aio_co_schedule(qemu_get_current_aio_context(), 4186 qemu_coroutine_self()); 4187 qemu_coroutine_yield(); 4188 } 4189 i++; 4190 4191 addr = qemu_get_be64(f); 4192 flags = addr & ~TARGET_PAGE_MASK; 4193 addr &= TARGET_PAGE_MASK; 4194 4195 if (flags & invalid_flags) { 4196 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 4197 error_report("Received an unexpected compressed page"); 4198 } 4199 4200 ret = -EINVAL; 4201 break; 4202 } 4203 4204 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4205 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4206 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4207 RAM_CHANNEL_PRECOPY); 4208 4209 host = host_from_ram_block_offset(block, addr); 4210 /* 4211 * After going into COLO stage, we should not load the page 4212 * into SVM's memory directly, we put them into colo_cache firstly. 4213 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4214 * Previously, we copied all these memory in preparing stage of COLO 4215 * while we need to stop VM, which is a time-consuming process. 
/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into the COLO stage, we should not load the page
             * into the SVM's memory directly; we put it into colo_cache
             * first.
             * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage, during which the VM had to be stopped, which is a
             * time-consuming process.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled.  Although this
             * affects the speed of the migration, it clearly reduces the
             * downtime compared to backing up all of the SVM's memory in
             * the COLO preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both the cache and the SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
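/*
 * Handshake between the two helpers above (an illustration, not part of
 * the original code): during postcopy recovery with, say, three RAMBlocks,
 * ram_dirty_bitmap_sync_all() sends three recv-bitmap requests and then
 * waits on rp_state.rp_sem three times, while the return-path thread posts
 * the semaphore once per bitmap it reloads, via
 * ram_dirty_bitmap_reload_notify().
 */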
4449 */ 4450 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4451 { 4452 int ret = -EINVAL; 4453 /* from_dst_file is always valid because we're within rp_thread */ 4454 QEMUFile *file = s->rp_state.from_dst_file; 4455 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4456 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4457 uint64_t size, end_mark; 4458 4459 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4460 4461 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4462 error_report("%s: incorrect state %s", __func__, 4463 MigrationStatus_str(s->state)); 4464 return -EINVAL; 4465 } 4466 4467 /* 4468 * Note: see comments in ramblock_recv_bitmap_send() on why we 4469 * need the endianness conversion, and the paddings. 4470 */ 4471 local_size = ROUND_UP(local_size, 8); 4472 4473 /* Add paddings */ 4474 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4475 4476 size = qemu_get_be64(file); 4477 4478 /* The size of the bitmap should match with our ramblock */ 4479 if (size != local_size) { 4480 error_report("%s: ramblock '%s' bitmap size mismatch " 4481 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4482 block->idstr, size, local_size); 4483 ret = -EINVAL; 4484 goto out; 4485 } 4486 4487 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4488 end_mark = qemu_get_be64(file); 4489 4490 ret = qemu_file_get_error(file); 4491 if (ret || size != local_size) { 4492 error_report("%s: read bitmap failed for ramblock '%s': %d" 4493 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4494 __func__, block->idstr, ret, local_size, size); 4495 ret = -EIO; 4496 goto out; 4497 } 4498 4499 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4500 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4501 __func__, block->idstr, end_mark); 4502 ret = -EINVAL; 4503 goto out; 4504 } 4505 4506 /* 4507 * Endianness conversion. We are during postcopy (though paused). 4508 * The dirty bitmap won't change. We can directly modify it. 4509 */ 4510 bitmap_from_le(block->bmap, le_bitmap, nbits); 4511 4512 /* 4513 * What we received is "received bitmap". Revert it as the initial 4514 * dirty bitmap for this ramblock. 4515 */ 4516 bitmap_complement(block->bmap, block->bmap, nbits); 4517 4518 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4519 ramblock_dirty_bitmap_clear_discarded_pages(block); 4520 4521 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4522 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4523 4524 /* 4525 * We succeeded to sync bitmap for current ramblock. If this is 4526 * the last one to sync, we need to notify the main send thread. 
4527 */ 4528 ram_dirty_bitmap_reload_notify(s); 4529 4530 ret = 0; 4531 out: 4532 g_free(le_bitmap); 4533 return ret; 4534 } 4535 4536 static int ram_resume_prepare(MigrationState *s, void *opaque) 4537 { 4538 RAMState *rs = *(RAMState **)opaque; 4539 int ret; 4540 4541 ret = ram_dirty_bitmap_sync_all(s, rs); 4542 if (ret) { 4543 return ret; 4544 } 4545 4546 ram_state_resume_prepare(rs, s->to_dst_file); 4547 4548 return 0; 4549 } 4550 4551 void postcopy_preempt_shutdown_file(MigrationState *s) 4552 { 4553 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4554 qemu_fflush(s->postcopy_qemufile_src); 4555 } 4556 4557 static SaveVMHandlers savevm_ram_handlers = { 4558 .save_setup = ram_save_setup, 4559 .save_live_iterate = ram_save_iterate, 4560 .save_live_complete_postcopy = ram_save_complete, 4561 .save_live_complete_precopy = ram_save_complete, 4562 .has_postcopy = ram_has_postcopy, 4563 .save_live_pending = ram_save_pending, 4564 .load_state = ram_load, 4565 .save_cleanup = ram_save_cleanup, 4566 .load_setup = ram_load_setup, 4567 .load_cleanup = ram_load_cleanup, 4568 .resume_prepare = ram_resume_prepare, 4569 }; 4570 4571 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4572 size_t old_size, size_t new_size) 4573 { 4574 PostcopyState ps = postcopy_state_get(); 4575 ram_addr_t offset; 4576 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4577 Error *err = NULL; 4578 4579 if (ramblock_is_ignored(rb)) { 4580 return; 4581 } 4582 4583 if (!migration_is_idle()) { 4584 /* 4585 * Precopy code on the source cannot deal with the size of RAM blocks 4586 * changing at random points in time - especially after sending the 4587 * RAM block sizes in the migration stream, they must no longer change. 4588 * Abort and indicate a proper reason. 4589 */ 4590 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4591 migration_cancel(err); 4592 error_free(err); 4593 } 4594 4595 switch (ps) { 4596 case POSTCOPY_INCOMING_ADVISE: 4597 /* 4598 * Update what ram_postcopy_incoming_init()->init_range() does at the 4599 * time postcopy was advised. Syncing RAM blocks with the source will 4600 * result in RAM resizes. 4601 */ 4602 if (old_size < new_size) { 4603 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4604 error_report("RAM block '%s' discard of resized RAM failed", 4605 rb->idstr); 4606 } 4607 } 4608 rb->postcopy_length = new_size; 4609 break; 4610 case POSTCOPY_INCOMING_NONE: 4611 case POSTCOPY_INCOMING_RUNNING: 4612 case POSTCOPY_INCOMING_END: 4613 /* 4614 * Once our guest is running, postcopy does no longer care about 4615 * resizes. When growing, the new memory was not available on the 4616 * source, no handler needed. 4617 */ 4618 break; 4619 default: 4620 error_report("RAM block '%s' resized during postcopy state: %d", 4621 rb->idstr, ps); 4622 exit(-1); 4623 } 4624 } 4625 4626 static RAMBlockNotifier ram_mig_ram_notifier = { 4627 .ram_block_resized = ram_mig_ram_block_resized, 4628 }; 4629 4630 void ram_mig_init(void) 4631 { 4632 qemu_mutex_init(&XBZRLE.lock); 4633 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4634 ram_block_notifier_add(&ram_mig_ram_notifier); 4635 } 4636