/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
67 */ 68 69 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 70 #define RAM_SAVE_FLAG_ZERO 0x02 71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 72 #define RAM_SAVE_FLAG_PAGE 0x08 73 #define RAM_SAVE_FLAG_EOS 0x10 74 #define RAM_SAVE_FLAG_CONTINUE 0x20 75 #define RAM_SAVE_FLAG_XBZRLE 0x40 76 /* 0x80 is reserved in migration.h start with 0x100 next */ 77 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 78 79 static inline bool is_zero_range(uint8_t *p, uint64_t size) 80 { 81 return buffer_is_zero(p, size); 82 } 83 84 XBZRLECacheStats xbzrle_counters; 85 86 /* struct contains XBZRLE cache and a static page 87 used by the compression */ 88 static struct { 89 /* buffer used for XBZRLE encoding */ 90 uint8_t *encoded_buf; 91 /* buffer for storing page content */ 92 uint8_t *current_buf; 93 /* Cache for XBZRLE, Protected by lock. */ 94 PageCache *cache; 95 QemuMutex lock; 96 /* it will store a page full of zeros */ 97 uint8_t *zero_target_page; 98 /* buffer used for XBZRLE decoding */ 99 uint8_t *decoded_buf; 100 } XBZRLE; 101 102 static void XBZRLE_cache_lock(void) 103 { 104 if (migrate_use_xbzrle()) { 105 qemu_mutex_lock(&XBZRLE.lock); 106 } 107 } 108 109 static void XBZRLE_cache_unlock(void) 110 { 111 if (migrate_use_xbzrle()) { 112 qemu_mutex_unlock(&XBZRLE.lock); 113 } 114 } 115 116 /** 117 * xbzrle_cache_resize: resize the xbzrle cache 118 * 119 * This function is called from qmp_migrate_set_cache_size in main 120 * thread, possibly while a migration is in progress. A running 121 * migration may be using the cache and might finish during this call, 122 * hence changes to the cache are protected by XBZRLE.lock(). 
123 * 124 * Returns 0 for success or -1 for error 125 * 126 * @new_size: new cache size 127 * @errp: set *errp if the check failed, with reason 128 */ 129 int xbzrle_cache_resize(int64_t new_size, Error **errp) 130 { 131 PageCache *new_cache; 132 int64_t ret = 0; 133 134 /* Check for truncation */ 135 if (new_size != (size_t)new_size) { 136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 137 "exceeding address space"); 138 return -1; 139 } 140 141 if (new_size == migrate_xbzrle_cache_size()) { 142 /* nothing to do */ 143 return 0; 144 } 145 146 XBZRLE_cache_lock(); 147 148 if (XBZRLE.cache != NULL) { 149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 150 if (!new_cache) { 151 ret = -1; 152 goto out; 153 } 154 155 cache_fini(XBZRLE.cache); 156 XBZRLE.cache = new_cache; 157 } 158 out: 159 XBZRLE_cache_unlock(); 160 return ret; 161 } 162 163 bool ramblock_is_ignored(RAMBlock *block) 164 { 165 return !qemu_ram_is_migratable(block) || 166 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 167 } 168 169 #undef RAMBLOCK_FOREACH 170 171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 172 { 173 RAMBlock *block; 174 int ret = 0; 175 176 RCU_READ_LOCK_GUARD(); 177 178 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 179 ret = func(block, opaque); 180 if (ret) { 181 break; 182 } 183 } 184 return ret; 185 } 186 187 static void ramblock_recv_map_init(void) 188 { 189 RAMBlock *rb; 190 191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 192 assert(!rb->receivedmap); 193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 194 } 195 } 196 197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 198 { 199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 200 rb->receivedmap); 201 } 202 203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 204 { 205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 206 } 207 208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 209 { 
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 211 } 212 213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 214 size_t nr) 215 { 216 bitmap_set_atomic(rb->receivedmap, 217 ramblock_recv_bitmap_offset(host_addr, rb), 218 nr); 219 } 220 221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 222 223 /* 224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 225 * 226 * Returns >0 if success with sent bytes, or <0 if error. 227 */ 228 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 229 const char *block_name) 230 { 231 RAMBlock *block = qemu_ram_block_by_name(block_name); 232 unsigned long *le_bitmap, nbits; 233 uint64_t size; 234 235 if (!block) { 236 error_report("%s: invalid block name: %s", __func__, block_name); 237 return -1; 238 } 239 240 nbits = block->used_length >> TARGET_PAGE_BITS; 241 242 /* 243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 244 * machines we may need 4 more bytes for padding (see below 245 * comment). So extend it a bit before hand. 246 */ 247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 248 249 /* 250 * Always use little endian when sending the bitmap. This is 251 * required that when source and destination VMs are not using the 252 * same endianness. (Note: big endian won't work.) 253 */ 254 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 255 256 /* Size of the bitmap, in bytes */ 257 size = DIV_ROUND_UP(nbits, 8); 258 259 /* 260 * size is always aligned to 8 bytes for 64bit machines, but it 261 * may not be true for 32bit machines. We need this padding to 262 * make sure the migration can survive even between 32bit and 263 * 64bit machines. 264 */ 265 size = ROUND_UP(size, 8); 266 267 qemu_put_be64(file, size); 268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 269 /* 270 * Mark as an end, in case the middle part is screwed up due to 271 * some "mysterious" reason. 
272 */ 273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 274 qemu_fflush(file); 275 276 g_free(le_bitmap); 277 278 if (qemu_file_get_error(file)) { 279 return qemu_file_get_error(file); 280 } 281 282 return size + sizeof(size); 283 } 284 285 /* 286 * An outstanding page request, on the source, having been received 287 * and queued 288 */ 289 struct RAMSrcPageRequest { 290 RAMBlock *rb; 291 hwaddr offset; 292 hwaddr len; 293 294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 295 }; 296 297 /* State of RAM for migration */ 298 struct RAMState { 299 /* QEMUFile used for this migration */ 300 QEMUFile *f; 301 /* Last block that we have visited searching for dirty pages */ 302 RAMBlock *last_seen_block; 303 /* Last block from where we have sent data */ 304 RAMBlock *last_sent_block; 305 /* Last dirty target page we have sent */ 306 ram_addr_t last_page; 307 /* last ram version we have seen */ 308 uint32_t last_version; 309 /* We are in the first round */ 310 bool ram_bulk_stage; 311 /* The free page optimization is enabled */ 312 bool fpo_enabled; 313 /* How many times we have dirty too many pages */ 314 int dirty_rate_high_cnt; 315 /* these variables are used for bitmap sync */ 316 /* last time we did a full bitmap_sync */ 317 int64_t time_last_bitmap_sync; 318 /* bytes transferred at start_time */ 319 uint64_t bytes_xfer_prev; 320 /* number of dirty pages since start_time */ 321 uint64_t num_dirty_pages_period; 322 /* xbzrle misses since the beginning of the period */ 323 uint64_t xbzrle_cache_miss_prev; 324 /* Amount of xbzrle pages since the beginning of the period */ 325 uint64_t xbzrle_pages_prev; 326 /* Amount of xbzrle encoded bytes since the beginning of the period */ 327 uint64_t xbzrle_bytes_prev; 328 329 /* compression statistics since the beginning of the period */ 330 /* amount of count that no free thread to compress data */ 331 uint64_t compress_thread_busy_prev; 332 /* amount bytes after compression */ 333 uint64_t compressed_size_prev; 334 /* amount 
of compressed pages */ 335 uint64_t compress_pages_prev; 336 337 /* total handled target pages at the beginning of period */ 338 uint64_t target_page_count_prev; 339 /* total handled target pages since start */ 340 uint64_t target_page_count; 341 /* number of dirty bits in the bitmap */ 342 uint64_t migration_dirty_pages; 343 /* Protects modification of the bitmap and migration dirty pages */ 344 QemuMutex bitmap_mutex; 345 /* The RAMBlock used in the last src_page_requests */ 346 RAMBlock *last_req_rb; 347 /* Queue of outstanding page requests from the destination */ 348 QemuMutex src_page_req_mutex; 349 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 350 }; 351 typedef struct RAMState RAMState; 352 353 static RAMState *ram_state; 354 355 static NotifierWithReturnList precopy_notifier_list; 356 357 void precopy_infrastructure_init(void) 358 { 359 notifier_with_return_list_init(&precopy_notifier_list); 360 } 361 362 void precopy_add_notifier(NotifierWithReturn *n) 363 { 364 notifier_with_return_list_add(&precopy_notifier_list, n); 365 } 366 367 void precopy_remove_notifier(NotifierWithReturn *n) 368 { 369 notifier_with_return_remove(n); 370 } 371 372 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 373 { 374 PrecopyNotifyData pnd; 375 pnd.reason = reason; 376 pnd.errp = errp; 377 378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 379 } 380 381 void precopy_enable_free_page_optimization(void) 382 { 383 if (!ram_state) { 384 return; 385 } 386 387 ram_state->fpo_enabled = true; 388 } 389 390 uint64_t ram_bytes_remaining(void) 391 { 392 return ram_state ? 
(ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 393 0; 394 } 395 396 MigrationStats ram_counters; 397 398 /* used by the search for pages to send */ 399 struct PageSearchStatus { 400 /* Current block being searched */ 401 RAMBlock *block; 402 /* Current page to search from */ 403 unsigned long page; 404 /* Set once we wrap around */ 405 bool complete_round; 406 }; 407 typedef struct PageSearchStatus PageSearchStatus; 408 409 CompressionStats compression_counters; 410 411 struct CompressParam { 412 bool done; 413 bool quit; 414 bool zero_page; 415 QEMUFile *file; 416 QemuMutex mutex; 417 QemuCond cond; 418 RAMBlock *block; 419 ram_addr_t offset; 420 421 /* internally used fields */ 422 z_stream stream; 423 uint8_t *originbuf; 424 }; 425 typedef struct CompressParam CompressParam; 426 427 struct DecompressParam { 428 bool done; 429 bool quit; 430 QemuMutex mutex; 431 QemuCond cond; 432 void *des; 433 uint8_t *compbuf; 434 int len; 435 z_stream stream; 436 }; 437 typedef struct DecompressParam DecompressParam; 438 439 static CompressParam *comp_param; 440 static QemuThread *compress_threads; 441 /* comp_done_cond is used to wake up the migration thread when 442 * one of the compression threads has finished the compression. 443 * comp_done_lock is used to co-work with comp_done_cond. 
444 */ 445 static QemuMutex comp_done_lock; 446 static QemuCond comp_done_cond; 447 /* The empty QEMUFileOps will be used by file in CompressParam */ 448 static const QEMUFileOps empty_ops = { }; 449 450 static QEMUFile *decomp_file; 451 static DecompressParam *decomp_param; 452 static QemuThread *decompress_threads; 453 static QemuMutex decomp_done_lock; 454 static QemuCond decomp_done_cond; 455 456 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 457 ram_addr_t offset, uint8_t *source_buf); 458 459 static void *do_data_compress(void *opaque) 460 { 461 CompressParam *param = opaque; 462 RAMBlock *block; 463 ram_addr_t offset; 464 bool zero_page; 465 466 qemu_mutex_lock(¶m->mutex); 467 while (!param->quit) { 468 if (param->block) { 469 block = param->block; 470 offset = param->offset; 471 param->block = NULL; 472 qemu_mutex_unlock(¶m->mutex); 473 474 zero_page = do_compress_ram_page(param->file, ¶m->stream, 475 block, offset, param->originbuf); 476 477 qemu_mutex_lock(&comp_done_lock); 478 param->done = true; 479 param->zero_page = zero_page; 480 qemu_cond_signal(&comp_done_cond); 481 qemu_mutex_unlock(&comp_done_lock); 482 483 qemu_mutex_lock(¶m->mutex); 484 } else { 485 qemu_cond_wait(¶m->cond, ¶m->mutex); 486 } 487 } 488 qemu_mutex_unlock(¶m->mutex); 489 490 return NULL; 491 } 492 493 static void compress_threads_save_cleanup(void) 494 { 495 int i, thread_count; 496 497 if (!migrate_use_compression() || !comp_param) { 498 return; 499 } 500 501 thread_count = migrate_compress_threads(); 502 for (i = 0; i < thread_count; i++) { 503 /* 504 * we use it as a indicator which shows if the thread is 505 * properly init'd or not 506 */ 507 if (!comp_param[i].file) { 508 break; 509 } 510 511 qemu_mutex_lock(&comp_param[i].mutex); 512 comp_param[i].quit = true; 513 qemu_cond_signal(&comp_param[i].cond); 514 qemu_mutex_unlock(&comp_param[i].mutex); 515 516 qemu_thread_join(compress_threads + i); 517 qemu_mutex_destroy(&comp_param[i].mutex); 
518 qemu_cond_destroy(&comp_param[i].cond); 519 deflateEnd(&comp_param[i].stream); 520 g_free(comp_param[i].originbuf); 521 qemu_fclose(comp_param[i].file); 522 comp_param[i].file = NULL; 523 } 524 qemu_mutex_destroy(&comp_done_lock); 525 qemu_cond_destroy(&comp_done_cond); 526 g_free(compress_threads); 527 g_free(comp_param); 528 compress_threads = NULL; 529 comp_param = NULL; 530 } 531 532 static int compress_threads_save_setup(void) 533 { 534 int i, thread_count; 535 536 if (!migrate_use_compression()) { 537 return 0; 538 } 539 thread_count = migrate_compress_threads(); 540 compress_threads = g_new0(QemuThread, thread_count); 541 comp_param = g_new0(CompressParam, thread_count); 542 qemu_cond_init(&comp_done_cond); 543 qemu_mutex_init(&comp_done_lock); 544 for (i = 0; i < thread_count; i++) { 545 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 546 if (!comp_param[i].originbuf) { 547 goto exit; 548 } 549 550 if (deflateInit(&comp_param[i].stream, 551 migrate_compress_level()) != Z_OK) { 552 g_free(comp_param[i].originbuf); 553 goto exit; 554 } 555 556 /* comp_param[i].file is just used as a dummy buffer to save data, 557 * set its ops to empty. 
558 */ 559 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 560 comp_param[i].done = true; 561 comp_param[i].quit = false; 562 qemu_mutex_init(&comp_param[i].mutex); 563 qemu_cond_init(&comp_param[i].cond); 564 qemu_thread_create(compress_threads + i, "compress", 565 do_data_compress, comp_param + i, 566 QEMU_THREAD_JOINABLE); 567 } 568 return 0; 569 570 exit: 571 compress_threads_save_cleanup(); 572 return -1; 573 } 574 575 /** 576 * save_page_header: write page header to wire 577 * 578 * If this is the 1st block, it also writes the block identification 579 * 580 * Returns the number of bytes written 581 * 582 * @f: QEMUFile where to send the data 583 * @block: block that contains the page we want to send 584 * @offset: offset inside the block for the page 585 * in the lower bits, it contains flags 586 */ 587 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 588 ram_addr_t offset) 589 { 590 size_t size, len; 591 592 if (block == rs->last_sent_block) { 593 offset |= RAM_SAVE_FLAG_CONTINUE; 594 } 595 qemu_put_be64(f, offset); 596 size = 8; 597 598 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 599 len = strlen(block->idstr); 600 qemu_put_byte(f, len); 601 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 602 size += 1 + len; 603 rs->last_sent_block = block; 604 } 605 return size; 606 } 607 608 /** 609 * mig_throttle_guest_down: throotle down the guest 610 * 611 * Reduce amount of guest cpu execution to hopefully slow down memory 612 * writes. If guest dirty memory rate is reduced below the rate at 613 * which we can transfer pages to the destination then we should be 614 * able to complete migration. Some workloads dirty memory way too 615 * fast and will not effectively converge, even with auto-converge. 
616 */ 617 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 618 uint64_t bytes_dirty_threshold) 619 { 620 MigrationState *s = migrate_get_current(); 621 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 622 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 623 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 624 int pct_max = s->parameters.max_cpu_throttle; 625 626 uint64_t throttle_now = cpu_throttle_get_percentage(); 627 uint64_t cpu_now, cpu_ideal, throttle_inc; 628 629 /* We have not started throttling yet. Let's start it. */ 630 if (!cpu_throttle_active()) { 631 cpu_throttle_set(pct_initial); 632 } else { 633 /* Throttling already on, just increase the rate */ 634 if (!pct_tailslow) { 635 throttle_inc = pct_increment; 636 } else { 637 /* Compute the ideal CPU percentage used by Guest, which may 638 * make the dirty rate match the dirty rate threshold. */ 639 cpu_now = 100 - throttle_now; 640 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 641 bytes_dirty_period); 642 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 643 } 644 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 645 } 646 } 647 648 /** 649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 650 * 651 * @rs: current RAM state 652 * @current_addr: address for the zero page 653 * 654 * Update the xbzrle cache to reflect a page that's been sent as all 0. 655 * The important thing is that a stale (not-yet-0'd) page be replaced 656 * by the new data. 657 * As a bonus, if the page wasn't in the cache it gets added so that 658 * when a small write is made into the 0'd page it gets XBZRLE sent. 
659 */ 660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 661 { 662 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 663 return; 664 } 665 666 /* We don't care if this fails to allocate a new cache page 667 * as long as it updated an old one */ 668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 669 ram_counters.dirty_sync_count); 670 } 671 672 #define ENCODING_FLAG_XBZRLE 0x1 673 674 /** 675 * save_xbzrle_page: compress and send current page 676 * 677 * Returns: 1 means that we wrote the page 678 * 0 means that page is identical to the one already sent 679 * -1 means that xbzrle would be longer than normal 680 * 681 * @rs: current RAM state 682 * @current_data: pointer to the address of the page contents 683 * @current_addr: addr of the page 684 * @block: block that contains the page we want to send 685 * @offset: offset inside the block for the page 686 * @last_stage: if we are at the completion stage 687 */ 688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 689 ram_addr_t current_addr, RAMBlock *block, 690 ram_addr_t offset, bool last_stage) 691 { 692 int encoded_len = 0, bytes_xbzrle; 693 uint8_t *prev_cached_page; 694 695 if (!cache_is_cached(XBZRLE.cache, current_addr, 696 ram_counters.dirty_sync_count)) { 697 xbzrle_counters.cache_miss++; 698 if (!last_stage) { 699 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 700 ram_counters.dirty_sync_count) == -1) { 701 return -1; 702 } else { 703 /* update *current_data when the page has been 704 inserted into cache */ 705 *current_data = get_cached_data(XBZRLE.cache, current_addr); 706 } 707 } 708 return -1; 709 } 710 711 /* 712 * Reaching here means the page has hit the xbzrle cache, no matter what 713 * encoding result it is (normal encoding, overflow or skipping the page), 714 * count the page as encoded. This is used to calculate the encoding rate. 
715 * 716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 717 * 2nd page turns out to be skipped (i.e. no new bytes written to the 718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 719 * skipped page included. In this way, the encoding rate can tell if the 720 * guest page is good for xbzrle encoding. 721 */ 722 xbzrle_counters.pages++; 723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 724 725 /* save current buffer into memory */ 726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 727 728 /* XBZRLE encoding (if there is no overflow) */ 729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 731 TARGET_PAGE_SIZE); 732 733 /* 734 * Update the cache contents, so that it corresponds to the data 735 * sent, in all cases except where we skip the page. 736 */ 737 if (!last_stage && encoded_len != 0) { 738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 739 /* 740 * In the case where we couldn't compress, ensure that the caller 741 * sends the data from the cache, since the guest might have 742 * changed the RAM since we copied it. 
743 */ 744 *current_data = prev_cached_page; 745 } 746 747 if (encoded_len == 0) { 748 trace_save_xbzrle_page_skipping(); 749 return 0; 750 } else if (encoded_len == -1) { 751 trace_save_xbzrle_page_overflow(); 752 xbzrle_counters.overflow++; 753 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 754 return -1; 755 } 756 757 /* Send XBZRLE based compressed page */ 758 bytes_xbzrle = save_page_header(rs, rs->f, block, 759 offset | RAM_SAVE_FLAG_XBZRLE); 760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 761 qemu_put_be16(rs->f, encoded_len); 762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 763 bytes_xbzrle += encoded_len + 1 + 2; 764 /* 765 * Like compressed_size (please see update_compress_thread_counts), 766 * the xbzrle encoded bytes don't count the 8 byte header with 767 * RAM_SAVE_FLAG_CONTINUE. 768 */ 769 xbzrle_counters.bytes += bytes_xbzrle - 8; 770 ram_counters.transferred += bytes_xbzrle; 771 772 return 1; 773 } 774 775 /** 776 * migration_bitmap_find_dirty: find the next dirty page from start 777 * 778 * Returns the page offset within memory region of the start of a dirty page 779 * 780 * @rs: current RAM state 781 * @rb: RAMBlock where to search for dirty pages 782 * @start: page where we start the search 783 */ 784 static inline 785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 786 unsigned long start) 787 { 788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 789 unsigned long *bitmap = rb->bmap; 790 unsigned long next; 791 792 if (ramblock_is_ignored(rb)) { 793 return size; 794 } 795 796 /* 797 * When the free page optimization is enabled, we need to check the bitmap 798 * to send the non-free pages rather than all the pages in the bulk stage. 
799 */ 800 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) { 801 next = start + 1; 802 } else { 803 next = find_next_bit(bitmap, size, start); 804 } 805 806 return next; 807 } 808 809 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 810 RAMBlock *rb, 811 unsigned long page) 812 { 813 bool ret; 814 815 qemu_mutex_lock(&rs->bitmap_mutex); 816 817 /* 818 * Clear dirty bitmap if needed. This _must_ be called before we 819 * send any of the page in the chunk because we need to make sure 820 * we can capture further page content changes when we sync dirty 821 * log the next time. So as long as we are going to send any of 822 * the page in the chunk we clear the remote dirty bitmap for all. 823 * Clearing it earlier won't be a problem, but too late will. 824 */ 825 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) { 826 uint8_t shift = rb->clear_bmap_shift; 827 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift); 828 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); 829 830 /* 831 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 832 * can make things easier sometimes since then start address 833 * of the small chunk will always be 64 pages aligned so the 834 * bitmap will always be aligned to unsigned long. We should 835 * even be able to remove this restriction but I'm simply 836 * keeping it. 
837 */ 838 assert(shift >= 6); 839 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 840 memory_region_clear_dirty_bitmap(rb->mr, start, size); 841 } 842 843 ret = test_and_clear_bit(page, rb->bmap); 844 845 if (ret) { 846 rs->migration_dirty_pages--; 847 } 848 qemu_mutex_unlock(&rs->bitmap_mutex); 849 850 return ret; 851 } 852 853 /* Called with RCU critical section */ 854 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 855 { 856 uint64_t new_dirty_pages = 857 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 858 859 rs->migration_dirty_pages += new_dirty_pages; 860 rs->num_dirty_pages_period += new_dirty_pages; 861 } 862 863 /** 864 * ram_pagesize_summary: calculate all the pagesizes of a VM 865 * 866 * Returns a summary bitmap of the page sizes of all RAMBlocks 867 * 868 * For VMs with just normal pages this is equivalent to the host page 869 * size. If it's got some huge pages then it's the OR of all the 870 * different page sizes. 871 */ 872 uint64_t ram_pagesize_summary(void) 873 { 874 RAMBlock *block; 875 uint64_t summary = 0; 876 877 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 878 summary |= block->page_size; 879 } 880 881 return summary; 882 } 883 884 uint64_t ram_get_total_transferred_pages(void) 885 { 886 return ram_counters.normal + ram_counters.duplicate + 887 compression_counters.pages + xbzrle_counters.pages; 888 } 889 890 static void migration_update_rates(RAMState *rs, int64_t end_time) 891 { 892 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 893 double compressed_size; 894 895 /* calculate period counters */ 896 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 897 / (end_time - rs->time_last_bitmap_sync); 898 899 if (!page_count) { 900 return; 901 } 902 903 if (migrate_use_xbzrle()) { 904 double encoded_size, unencoded_size; 905 906 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 907 rs->xbzrle_cache_miss_prev) / page_count; 908 
rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 909 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 910 TARGET_PAGE_SIZE; 911 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 912 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 913 xbzrle_counters.encoding_rate = 0; 914 } else { 915 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 916 } 917 rs->xbzrle_pages_prev = xbzrle_counters.pages; 918 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 919 } 920 921 if (migrate_use_compression()) { 922 compression_counters.busy_rate = (double)(compression_counters.busy - 923 rs->compress_thread_busy_prev) / page_count; 924 rs->compress_thread_busy_prev = compression_counters.busy; 925 926 compressed_size = compression_counters.compressed_size - 927 rs->compressed_size_prev; 928 if (compressed_size) { 929 double uncompressed_size = (compression_counters.pages - 930 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 931 932 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 933 compression_counters.compression_rate = 934 uncompressed_size / compressed_size; 935 936 rs->compress_pages_prev = compression_counters.pages; 937 rs->compressed_size_prev = compression_counters.compressed_size; 938 } 939 } 940 } 941 942 static void migration_trigger_throttle(RAMState *rs) 943 { 944 MigrationState *s = migrate_get_current(); 945 uint64_t threshold = s->parameters.throttle_trigger_threshold; 946 947 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 948 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 949 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 950 951 /* During block migration the auto-converge logic incorrectly detects 952 * that ram migration makes no progress. Avoid this by disabling the 953 * throttling logic during the bulk phase of block migration. 
*/ 954 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 955 /* The following detection logic can be refined later. For now: 956 Check to see if the ratio between dirtied bytes and the approx. 957 amount of bytes that just got transferred since the last time 958 we were in this routine reaches the threshold. If that happens 959 twice, start or increase throttling. */ 960 961 if ((bytes_dirty_period > bytes_dirty_threshold) && 962 (++rs->dirty_rate_high_cnt >= 2)) { 963 trace_migration_throttle(); 964 rs->dirty_rate_high_cnt = 0; 965 mig_throttle_guest_down(bytes_dirty_period, 966 bytes_dirty_threshold); 967 } 968 } 969 } 970 971 static void migration_bitmap_sync(RAMState *rs) 972 { 973 RAMBlock *block; 974 int64_t end_time; 975 976 ram_counters.dirty_sync_count++; 977 978 if (!rs->time_last_bitmap_sync) { 979 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 980 } 981 982 trace_migration_bitmap_sync_start(); 983 memory_global_dirty_log_sync(); 984 985 qemu_mutex_lock(&rs->bitmap_mutex); 986 WITH_RCU_READ_LOCK_GUARD() { 987 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 988 ramblock_sync_dirty_bitmap(rs, block); 989 } 990 ram_counters.remaining = ram_bytes_remaining(); 991 } 992 qemu_mutex_unlock(&rs->bitmap_mutex); 993 994 memory_global_after_dirty_log_sync(); 995 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 996 997 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 998 999 /* more than 1 second = 1000 millisecons */ 1000 if (end_time > rs->time_last_bitmap_sync + 1000) { 1001 migration_trigger_throttle(rs); 1002 1003 migration_update_rates(rs, end_time); 1004 1005 rs->target_page_count_prev = rs->target_page_count; 1006 1007 /* reset period counters */ 1008 rs->time_last_bitmap_sync = end_time; 1009 rs->num_dirty_pages_period = 0; 1010 rs->bytes_xfer_prev = ram_counters.transferred; 1011 } 1012 if (migrate_use_events()) { 1013 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1014 } 1015 } 1016 1017 static void 
migration_bitmap_sync_precopy(RAMState *rs) 1018 { 1019 Error *local_err = NULL; 1020 1021 /* 1022 * The current notifier usage is just an optimization to migration, so we 1023 * don't stop the normal migration process in the error case. 1024 */ 1025 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1026 error_report_err(local_err); 1027 local_err = NULL; 1028 } 1029 1030 migration_bitmap_sync(rs); 1031 1032 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1033 error_report_err(local_err); 1034 } 1035 } 1036 1037 /** 1038 * save_zero_page_to_file: send the zero page to the file 1039 * 1040 * Returns the size of data written to the file, 0 means the page is not 1041 * a zero page 1042 * 1043 * @rs: current RAM state 1044 * @file: the file where the data is saved 1045 * @block: block that contains the page we want to send 1046 * @offset: offset inside the block for the page 1047 */ 1048 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1049 RAMBlock *block, ram_addr_t offset) 1050 { 1051 uint8_t *p = block->host + offset; 1052 int len = 0; 1053 1054 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1055 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1056 qemu_put_byte(file, 0); 1057 len += 1; 1058 } 1059 return len; 1060 } 1061 1062 /** 1063 * save_zero_page: send the zero page to the stream 1064 * 1065 * Returns the number of pages written. 
1066 * 1067 * @rs: current RAM state 1068 * @block: block that contains the page we want to send 1069 * @offset: offset inside the block for the page 1070 */ 1071 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1072 { 1073 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1074 1075 if (len) { 1076 ram_counters.duplicate++; 1077 ram_counters.transferred += len; 1078 return 1; 1079 } 1080 return -1; 1081 } 1082 1083 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1084 { 1085 if (!migrate_release_ram() || !migration_in_postcopy()) { 1086 return; 1087 } 1088 1089 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1090 } 1091 1092 /* 1093 * @pages: the number of pages written by the control path, 1094 * < 0 - error 1095 * > 0 - number of pages written 1096 * 1097 * Return true if the pages has been saved, otherwise false is returned. 1098 */ 1099 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1100 int *pages) 1101 { 1102 uint64_t bytes_xmit = 0; 1103 int ret; 1104 1105 *pages = -1; 1106 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1107 &bytes_xmit); 1108 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1109 return false; 1110 } 1111 1112 if (bytes_xmit) { 1113 ram_counters.transferred += bytes_xmit; 1114 *pages = 1; 1115 } 1116 1117 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1118 return true; 1119 } 1120 1121 if (bytes_xmit > 0) { 1122 ram_counters.normal++; 1123 } else if (bytes_xmit == 0) { 1124 ram_counters.duplicate++; 1125 } 1126 1127 return true; 1128 } 1129 1130 /* 1131 * directly send the page to the stream 1132 * 1133 * Returns the number of pages written. 
1134 * 1135 * @rs: current RAM state 1136 * @block: block that contains the page we want to send 1137 * @offset: offset inside the block for the page 1138 * @buf: the page to be sent 1139 * @async: send to page asyncly 1140 */ 1141 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1142 uint8_t *buf, bool async) 1143 { 1144 ram_counters.transferred += save_page_header(rs, rs->f, block, 1145 offset | RAM_SAVE_FLAG_PAGE); 1146 if (async) { 1147 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1148 migrate_release_ram() & 1149 migration_in_postcopy()); 1150 } else { 1151 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1152 } 1153 ram_counters.transferred += TARGET_PAGE_SIZE; 1154 ram_counters.normal++; 1155 return 1; 1156 } 1157 1158 /** 1159 * ram_save_page: send the given page to the stream 1160 * 1161 * Returns the number of pages written. 1162 * < 0 - error 1163 * >=0 - Number of pages written - this might legally be 0 1164 * if xbzrle noticed the page was the same. 
1165 * 1166 * @rs: current RAM state 1167 * @block: block that contains the page we want to send 1168 * @offset: offset inside the block for the page 1169 * @last_stage: if we are at the completion stage 1170 */ 1171 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1172 { 1173 int pages = -1; 1174 uint8_t *p; 1175 bool send_async = true; 1176 RAMBlock *block = pss->block; 1177 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1178 ram_addr_t current_addr = block->offset + offset; 1179 1180 p = block->host + offset; 1181 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1182 1183 XBZRLE_cache_lock(); 1184 if (!rs->ram_bulk_stage && !migration_in_postcopy() && 1185 migrate_use_xbzrle()) { 1186 pages = save_xbzrle_page(rs, &p, current_addr, block, 1187 offset, last_stage); 1188 if (!last_stage) { 1189 /* Can't send this cached data async, since the cache page 1190 * might get updated before it gets to the wire 1191 */ 1192 send_async = false; 1193 } 1194 } 1195 1196 /* XBZRLE overflow or normal page */ 1197 if (pages == -1) { 1198 pages = save_normal_page(rs, block, offset, p, send_async); 1199 } 1200 1201 XBZRLE_cache_unlock(); 1202 1203 return pages; 1204 } 1205 1206 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1207 ram_addr_t offset) 1208 { 1209 if (multifd_queue_page(rs->f, block, offset) < 0) { 1210 return -1; 1211 } 1212 ram_counters.normal++; 1213 1214 return 1; 1215 } 1216 1217 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1218 ram_addr_t offset, uint8_t *source_buf) 1219 { 1220 RAMState *rs = ram_state; 1221 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1222 bool zero_page = false; 1223 int ret; 1224 1225 if (save_zero_page_to_file(rs, f, block, offset)) { 1226 zero_page = true; 1227 goto exit; 1228 } 1229 1230 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1231 1232 /* 1233 * copy it to a internal buffer to avoid it being 
modified by VM 1234 * so that we can catch up the error during compression and 1235 * decompression 1236 */ 1237 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1238 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1239 if (ret < 0) { 1240 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1241 error_report("compressed data failed!"); 1242 return false; 1243 } 1244 1245 exit: 1246 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1247 return zero_page; 1248 } 1249 1250 static void 1251 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1252 { 1253 ram_counters.transferred += bytes_xmit; 1254 1255 if (param->zero_page) { 1256 ram_counters.duplicate++; 1257 return; 1258 } 1259 1260 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1261 compression_counters.compressed_size += bytes_xmit - 8; 1262 compression_counters.pages++; 1263 } 1264 1265 static bool save_page_use_compression(RAMState *rs); 1266 1267 static void flush_compressed_data(RAMState *rs) 1268 { 1269 int idx, len, thread_count; 1270 1271 if (!save_page_use_compression(rs)) { 1272 return; 1273 } 1274 thread_count = migrate_compress_threads(); 1275 1276 qemu_mutex_lock(&comp_done_lock); 1277 for (idx = 0; idx < thread_count; idx++) { 1278 while (!comp_param[idx].done) { 1279 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1280 } 1281 } 1282 qemu_mutex_unlock(&comp_done_lock); 1283 1284 for (idx = 0; idx < thread_count; idx++) { 1285 qemu_mutex_lock(&comp_param[idx].mutex); 1286 if (!comp_param[idx].quit) { 1287 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1288 /* 1289 * it's safe to fetch zero_page without holding comp_done_lock 1290 * as there is no further request submitted to the thread, 1291 * i.e, the thread should be waiting for a request at this point. 
1292 */ 1293 update_compress_thread_counts(&comp_param[idx], len); 1294 } 1295 qemu_mutex_unlock(&comp_param[idx].mutex); 1296 } 1297 } 1298 1299 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1300 ram_addr_t offset) 1301 { 1302 param->block = block; 1303 param->offset = offset; 1304 } 1305 1306 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1307 ram_addr_t offset) 1308 { 1309 int idx, thread_count, bytes_xmit = -1, pages = -1; 1310 bool wait = migrate_compress_wait_thread(); 1311 1312 thread_count = migrate_compress_threads(); 1313 qemu_mutex_lock(&comp_done_lock); 1314 retry: 1315 for (idx = 0; idx < thread_count; idx++) { 1316 if (comp_param[idx].done) { 1317 comp_param[idx].done = false; 1318 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1319 qemu_mutex_lock(&comp_param[idx].mutex); 1320 set_compress_params(&comp_param[idx], block, offset); 1321 qemu_cond_signal(&comp_param[idx].cond); 1322 qemu_mutex_unlock(&comp_param[idx].mutex); 1323 pages = 1; 1324 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1325 break; 1326 } 1327 } 1328 1329 /* 1330 * wait for the free thread if the user specifies 'compress-wait-thread', 1331 * otherwise we will post the page out in the main thread as normal page. 1332 */ 1333 if (pages < 0 && wait) { 1334 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1335 goto retry; 1336 } 1337 qemu_mutex_unlock(&comp_done_lock); 1338 1339 return pages; 1340 } 1341 1342 /** 1343 * find_dirty_block: find the next dirty page and update any state 1344 * associated with the search process. 
1345 * 1346 * Returns true if a page is found 1347 * 1348 * @rs: current RAM state 1349 * @pss: data about the state of the current dirty page scan 1350 * @again: set to false if the search has scanned the whole of RAM 1351 */ 1352 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1353 { 1354 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1355 if (pss->complete_round && pss->block == rs->last_seen_block && 1356 pss->page >= rs->last_page) { 1357 /* 1358 * We've been once around the RAM and haven't found anything. 1359 * Give up. 1360 */ 1361 *again = false; 1362 return false; 1363 } 1364 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS) 1365 >= pss->block->used_length) { 1366 /* Didn't find anything in this RAM Block */ 1367 pss->page = 0; 1368 pss->block = QLIST_NEXT_RCU(pss->block, next); 1369 if (!pss->block) { 1370 /* 1371 * If memory migration starts over, we will meet a dirtied page 1372 * which may still exists in compression threads's ring, so we 1373 * should flush the compressed data to make sure the new page 1374 * is not overwritten by the old one in the destination. 1375 * 1376 * Also If xbzrle is on, stop using the data compression at this 1377 * point. In theory, xbzrle can do better than compression. 1378 */ 1379 flush_compressed_data(rs); 1380 1381 /* Hit the end of the list */ 1382 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1383 /* Flag that we've looped */ 1384 pss->complete_round = true; 1385 rs->ram_bulk_stage = false; 1386 } 1387 /* Didn't find anything this time, but try again on the new block */ 1388 *again = true; 1389 return false; 1390 } else { 1391 /* Can go around again, but... 
*/ 1392 *again = true; 1393 /* We've found something so probably don't need to */ 1394 return true; 1395 } 1396 } 1397 1398 /** 1399 * unqueue_page: gets a page of the queue 1400 * 1401 * Helper for 'get_queued_page' - gets a page off the queue 1402 * 1403 * Returns the block of the page (or NULL if none available) 1404 * 1405 * @rs: current RAM state 1406 * @offset: used to return the offset within the RAMBlock 1407 */ 1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1409 { 1410 RAMBlock *block = NULL; 1411 1412 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1413 return NULL; 1414 } 1415 1416 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1417 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1418 struct RAMSrcPageRequest *entry = 1419 QSIMPLEQ_FIRST(&rs->src_page_requests); 1420 block = entry->rb; 1421 *offset = entry->offset; 1422 1423 if (entry->len > TARGET_PAGE_SIZE) { 1424 entry->len -= TARGET_PAGE_SIZE; 1425 entry->offset += TARGET_PAGE_SIZE; 1426 } else { 1427 memory_region_unref(block->mr); 1428 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1429 g_free(entry); 1430 migration_consume_urgent_request(); 1431 } 1432 } 1433 1434 return block; 1435 } 1436 1437 /** 1438 * get_queued_page: unqueue a page from the postcopy requests 1439 * 1440 * Skips pages that are already sent (!dirty) 1441 * 1442 * Returns true if a queued page is found 1443 * 1444 * @rs: current RAM state 1445 * @pss: data about the state of the current dirty page scan 1446 */ 1447 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1448 { 1449 RAMBlock *block; 1450 ram_addr_t offset; 1451 bool dirty; 1452 1453 do { 1454 block = unqueue_page(rs, &offset); 1455 /* 1456 * We're sending this page, and since it's postcopy nothing else 1457 * will dirty it, and we must make sure it doesn't get sent again 1458 * even if this queue request was received after the background 1459 * search already sent it. 
1460 */ 1461 if (block) { 1462 unsigned long page; 1463 1464 page = offset >> TARGET_PAGE_BITS; 1465 dirty = test_bit(page, block->bmap); 1466 if (!dirty) { 1467 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1468 page); 1469 } else { 1470 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1471 } 1472 } 1473 1474 } while (block && !dirty); 1475 1476 if (block) { 1477 /* 1478 * As soon as we start servicing pages out of order, then we have 1479 * to kill the bulk stage, since the bulk stage assumes 1480 * in (migration_bitmap_find_and_reset_dirty) that every page is 1481 * dirty, that's no longer true. 1482 */ 1483 rs->ram_bulk_stage = false; 1484 1485 /* 1486 * We want the background search to continue from the queued page 1487 * since the guest is likely to want other pages near to the page 1488 * it just requested. 1489 */ 1490 pss->block = block; 1491 pss->page = offset >> TARGET_PAGE_BITS; 1492 1493 /* 1494 * This unqueued page would break the "one round" check, even is 1495 * really rare. 1496 */ 1497 pss->complete_round = false; 1498 } 1499 1500 return !!block; 1501 } 1502 1503 /** 1504 * migration_page_queue_free: drop any remaining pages in the ram 1505 * request queue 1506 * 1507 * It should be empty at the end anyway, but in error cases there may 1508 * be some left. in case that there is any page left, we drop it. 1509 * 1510 */ 1511 static void migration_page_queue_free(RAMState *rs) 1512 { 1513 struct RAMSrcPageRequest *mspr, *next_mspr; 1514 /* This queue generally should be empty - but in the case of a failed 1515 * migration might have some droppings in. 
1516 */ 1517 RCU_READ_LOCK_GUARD(); 1518 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1519 memory_region_unref(mspr->rb->mr); 1520 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1521 g_free(mspr); 1522 } 1523 } 1524 1525 /** 1526 * ram_save_queue_pages: queue the page for transmission 1527 * 1528 * A request from postcopy destination for example. 1529 * 1530 * Returns zero on success or negative on error 1531 * 1532 * @rbname: Name of the RAMBLock of the request. NULL means the 1533 * same that last one. 1534 * @start: starting address from the start of the RAMBlock 1535 * @len: length (in bytes) to send 1536 */ 1537 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1538 { 1539 RAMBlock *ramblock; 1540 RAMState *rs = ram_state; 1541 1542 ram_counters.postcopy_requests++; 1543 RCU_READ_LOCK_GUARD(); 1544 1545 if (!rbname) { 1546 /* Reuse last RAMBlock */ 1547 ramblock = rs->last_req_rb; 1548 1549 if (!ramblock) { 1550 /* 1551 * Shouldn't happen, we can't reuse the last RAMBlock if 1552 * it's the 1st request. 
1553 */ 1554 error_report("ram_save_queue_pages no previous block"); 1555 return -1; 1556 } 1557 } else { 1558 ramblock = qemu_ram_block_by_name(rbname); 1559 1560 if (!ramblock) { 1561 /* We shouldn't be asked for a non-existent RAMBlock */ 1562 error_report("ram_save_queue_pages no block '%s'", rbname); 1563 return -1; 1564 } 1565 rs->last_req_rb = ramblock; 1566 } 1567 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1568 if (start + len > ramblock->used_length) { 1569 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1570 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1571 __func__, start, len, ramblock->used_length); 1572 return -1; 1573 } 1574 1575 struct RAMSrcPageRequest *new_entry = 1576 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1577 new_entry->rb = ramblock; 1578 new_entry->offset = start; 1579 new_entry->len = len; 1580 1581 memory_region_ref(ramblock->mr); 1582 qemu_mutex_lock(&rs->src_page_req_mutex); 1583 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1584 migration_make_urgent_request(); 1585 qemu_mutex_unlock(&rs->src_page_req_mutex); 1586 1587 return 0; 1588 } 1589 1590 static bool save_page_use_compression(RAMState *rs) 1591 { 1592 if (!migrate_use_compression()) { 1593 return false; 1594 } 1595 1596 /* 1597 * If xbzrle is on, stop using the data compression after first 1598 * round of migration even if compression is enabled. In theory, 1599 * xbzrle can do better than compression. 
1600 */ 1601 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 1602 return true; 1603 } 1604 1605 return false; 1606 } 1607 1608 /* 1609 * try to compress the page before posting it out, return true if the page 1610 * has been properly handled by compression, otherwise needs other 1611 * paths to handle it 1612 */ 1613 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1614 { 1615 if (!save_page_use_compression(rs)) { 1616 return false; 1617 } 1618 1619 /* 1620 * When starting the process of a new block, the first page of 1621 * the block should be sent out before other pages in the same 1622 * block, and all the pages in last block should have been sent 1623 * out, keeping this order is important, because the 'cont' flag 1624 * is used to avoid resending the block name. 1625 * 1626 * We post the fist page as normal page as compression will take 1627 * much CPU resource. 1628 */ 1629 if (block != rs->last_sent_block) { 1630 flush_compressed_data(rs); 1631 return false; 1632 } 1633 1634 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1635 return true; 1636 } 1637 1638 compression_counters.busy++; 1639 return false; 1640 } 1641 1642 /** 1643 * ram_save_target_page: save one target page 1644 * 1645 * Returns the number of pages written 1646 * 1647 * @rs: current RAM state 1648 * @pss: data about the page we want to send 1649 * @last_stage: if we are at the completion stage 1650 */ 1651 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1652 bool last_stage) 1653 { 1654 RAMBlock *block = pss->block; 1655 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1656 int res; 1657 1658 if (control_save_page(rs, block, offset, &res)) { 1659 return res; 1660 } 1661 1662 if (save_compress_page(rs, block, offset)) { 1663 return 1; 1664 } 1665 1666 res = save_zero_page(rs, block, offset); 1667 if (res > 0) { 1668 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1669 * page would be stale 1670 
*/ 1671 if (!save_page_use_compression(rs)) { 1672 XBZRLE_cache_lock(); 1673 xbzrle_cache_zero_page(rs, block->offset + offset); 1674 XBZRLE_cache_unlock(); 1675 } 1676 ram_release_pages(block->idstr, offset, res); 1677 return res; 1678 } 1679 1680 /* 1681 * Do not use multifd for: 1682 * 1. Compression as the first page in the new block should be posted out 1683 * before sending the compressed page 1684 * 2. In postcopy as one whole host page should be placed 1685 */ 1686 if (!save_page_use_compression(rs) && migrate_use_multifd() 1687 && !migration_in_postcopy()) { 1688 return ram_save_multifd_page(rs, block, offset); 1689 } 1690 1691 return ram_save_page(rs, pss, last_stage); 1692 } 1693 1694 /** 1695 * ram_save_host_page: save a whole host page 1696 * 1697 * Starting at *offset send pages up to the end of the current host 1698 * page. It's valid for the initial offset to point into the middle of 1699 * a host page in which case the remainder of the hostpage is sent. 1700 * Only dirty target pages are sent. Note that the host page size may 1701 * be a huge page for this block. 1702 * The saving stops at the boundary of the used_length of the block 1703 * if the RAMBlock isn't a multiple of the host page size. 
1704 * 1705 * Returns the number of pages written or negative on error 1706 * 1707 * @rs: current RAM state 1708 * @ms: current migration state 1709 * @pss: data about the page we want to send 1710 * @last_stage: if we are at the completion stage 1711 */ 1712 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 1713 bool last_stage) 1714 { 1715 int tmppages, pages = 0; 1716 size_t pagesize_bits = 1717 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 1718 1719 if (ramblock_is_ignored(pss->block)) { 1720 error_report("block %s should not be migrated !", pss->block->idstr); 1721 return 0; 1722 } 1723 1724 do { 1725 /* Check the pages is dirty and if it is send it */ 1726 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 1727 pss->page++; 1728 continue; 1729 } 1730 1731 tmppages = ram_save_target_page(rs, pss, last_stage); 1732 if (tmppages < 0) { 1733 return tmppages; 1734 } 1735 1736 pages += tmppages; 1737 pss->page++; 1738 /* Allow rate limiting to happen in the middle of huge pages */ 1739 migration_rate_limit(); 1740 } while ((pss->page & (pagesize_bits - 1)) && 1741 offset_in_ramblock(pss->block, 1742 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 1743 1744 /* The offset we leave with is the last one we looked at */ 1745 pss->page--; 1746 return pages; 1747 } 1748 1749 /** 1750 * ram_find_and_save_block: finds a dirty page and sends it to f 1751 * 1752 * Called within an RCU critical section. 1753 * 1754 * Returns the number of pages written where zero means no dirty pages, 1755 * or negative on error 1756 * 1757 * @rs: current RAM state 1758 * @last_stage: if we are at the completion stage 1759 * 1760 * On systems where host-page-size > target-page-size it will send all the 1761 * pages in a host page that are dirty. 
1762 */ 1763 1764 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1765 { 1766 PageSearchStatus pss; 1767 int pages = 0; 1768 bool again, found; 1769 1770 /* No dirty page as there is zero RAM */ 1771 if (!ram_bytes_total()) { 1772 return pages; 1773 } 1774 1775 pss.block = rs->last_seen_block; 1776 pss.page = rs->last_page; 1777 pss.complete_round = false; 1778 1779 if (!pss.block) { 1780 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1781 } 1782 1783 do { 1784 again = true; 1785 found = get_queued_page(rs, &pss); 1786 1787 if (!found) { 1788 /* priority queue empty, so just search for something dirty */ 1789 found = find_dirty_block(rs, &pss, &again); 1790 } 1791 1792 if (found) { 1793 pages = ram_save_host_page(rs, &pss, last_stage); 1794 } 1795 } while (!pages && again); 1796 1797 rs->last_seen_block = pss.block; 1798 rs->last_page = pss.page; 1799 1800 return pages; 1801 } 1802 1803 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1804 { 1805 uint64_t pages = size / TARGET_PAGE_SIZE; 1806 1807 if (zero) { 1808 ram_counters.duplicate += pages; 1809 } else { 1810 ram_counters.normal += pages; 1811 ram_counters.transferred += size; 1812 qemu_update_position(f, size); 1813 } 1814 } 1815 1816 static uint64_t ram_bytes_total_common(bool count_ignored) 1817 { 1818 RAMBlock *block; 1819 uint64_t total = 0; 1820 1821 RCU_READ_LOCK_GUARD(); 1822 1823 if (count_ignored) { 1824 RAMBLOCK_FOREACH_MIGRATABLE(block) { 1825 total += block->used_length; 1826 } 1827 } else { 1828 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1829 total += block->used_length; 1830 } 1831 } 1832 return total; 1833 } 1834 1835 uint64_t ram_bytes_total(void) 1836 { 1837 return ram_bytes_total_common(false); 1838 } 1839 1840 static void xbzrle_load_setup(void) 1841 { 1842 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 1843 } 1844 1845 static void xbzrle_load_cleanup(void) 1846 { 1847 g_free(XBZRLE.decoded_buf); 1848 XBZRLE.decoded_buf = NULL; 1849 } 1850 1851 static void 
ram_state_cleanup(RAMState **rsp) 1852 { 1853 if (*rsp) { 1854 migration_page_queue_free(*rsp); 1855 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 1856 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 1857 g_free(*rsp); 1858 *rsp = NULL; 1859 } 1860 } 1861 1862 static void xbzrle_cleanup(void) 1863 { 1864 XBZRLE_cache_lock(); 1865 if (XBZRLE.cache) { 1866 cache_fini(XBZRLE.cache); 1867 g_free(XBZRLE.encoded_buf); 1868 g_free(XBZRLE.current_buf); 1869 g_free(XBZRLE.zero_target_page); 1870 XBZRLE.cache = NULL; 1871 XBZRLE.encoded_buf = NULL; 1872 XBZRLE.current_buf = NULL; 1873 XBZRLE.zero_target_page = NULL; 1874 } 1875 XBZRLE_cache_unlock(); 1876 } 1877 1878 static void ram_save_cleanup(void *opaque) 1879 { 1880 RAMState **rsp = opaque; 1881 RAMBlock *block; 1882 1883 /* caller have hold iothread lock or is in a bh, so there is 1884 * no writing race against the migration bitmap 1885 */ 1886 memory_global_dirty_log_stop(); 1887 1888 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1889 g_free(block->clear_bmap); 1890 block->clear_bmap = NULL; 1891 g_free(block->bmap); 1892 block->bmap = NULL; 1893 } 1894 1895 xbzrle_cleanup(); 1896 compress_threads_save_cleanup(); 1897 ram_state_cleanup(rsp); 1898 } 1899 1900 static void ram_state_reset(RAMState *rs) 1901 { 1902 rs->last_seen_block = NULL; 1903 rs->last_sent_block = NULL; 1904 rs->last_page = 0; 1905 rs->last_version = ram_list.version; 1906 rs->ram_bulk_stage = true; 1907 rs->fpo_enabled = false; 1908 } 1909 1910 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1911 1912 /* 1913 * 'expected' is the value you expect the bitmap mostly to be full 1914 * of; it won't bother printing lines that are all this value. 1915 * If 'todump' is null the migration bitmap is dumped. 
1916 */ 1917 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 1918 unsigned long pages) 1919 { 1920 int64_t cur; 1921 int64_t linelen = 128; 1922 char linebuf[129]; 1923 1924 for (cur = 0; cur < pages; cur += linelen) { 1925 int64_t curb; 1926 bool found = false; 1927 /* 1928 * Last line; catch the case where the line length 1929 * is longer than remaining ram 1930 */ 1931 if (cur + linelen > pages) { 1932 linelen = pages - cur; 1933 } 1934 for (curb = 0; curb < linelen; curb++) { 1935 bool thisbit = test_bit(cur + curb, todump); 1936 linebuf[curb] = thisbit ? '1' : '.'; 1937 found = found || (thisbit != expected); 1938 } 1939 if (found) { 1940 linebuf[curb] = '\0'; 1941 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1942 } 1943 } 1944 } 1945 1946 /* **** functions for postcopy ***** */ 1947 1948 void ram_postcopy_migrated_memory_release(MigrationState *ms) 1949 { 1950 struct RAMBlock *block; 1951 1952 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1953 unsigned long *bitmap = block->bmap; 1954 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 1955 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 1956 1957 while (run_start < range) { 1958 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 1959 ram_discard_range(block->idstr, 1960 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 1961 ((ram_addr_t)(run_end - run_start)) 1962 << TARGET_PAGE_BITS); 1963 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 1964 } 1965 } 1966 } 1967 1968 /** 1969 * postcopy_send_discard_bm_ram: discard a RAMBlock 1970 * 1971 * Returns zero on success 1972 * 1973 * Callback from postcopy_each_ram_send_discard for each RAMBlock 1974 * 1975 * @ms: current migration state 1976 * @block: RAMBlock to discard 1977 */ 1978 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 1979 { 1980 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 1981 unsigned long current; 1982 unsigned long *bitmap = 
block->bmap; 1983 1984 for (current = 0; current < end; ) { 1985 unsigned long one = find_next_bit(bitmap, end, current); 1986 unsigned long zero, discard_length; 1987 1988 if (one >= end) { 1989 break; 1990 } 1991 1992 zero = find_next_zero_bit(bitmap, end, one + 1); 1993 1994 if (zero >= end) { 1995 discard_length = end - one; 1996 } else { 1997 discard_length = zero - one; 1998 } 1999 postcopy_discard_send_range(ms, one, discard_length); 2000 current = one + discard_length; 2001 } 2002 2003 return 0; 2004 } 2005 2006 /** 2007 * postcopy_each_ram_send_discard: discard all RAMBlocks 2008 * 2009 * Returns 0 for success or negative for error 2010 * 2011 * Utility for the outgoing postcopy code. 2012 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2013 * passing it bitmap indexes and name. 2014 * (qemu_ram_foreach_block ends up passing unscaled lengths 2015 * which would mean postcopy code would have to deal with target page) 2016 * 2017 * @ms: current migration state 2018 */ 2019 static int postcopy_each_ram_send_discard(MigrationState *ms) 2020 { 2021 struct RAMBlock *block; 2022 int ret; 2023 2024 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2025 postcopy_discard_send_init(ms, block->idstr); 2026 2027 /* 2028 * Postcopy sends chunks of bitmap over the wire, but it 2029 * just needs indexes at this point, avoids it having 2030 * target page specific code. 2031 */ 2032 ret = postcopy_send_discard_bm_ram(ms, block); 2033 postcopy_discard_send_finish(ms); 2034 if (ret) { 2035 return ret; 2036 } 2037 } 2038 2039 return 0; 2040 } 2041 2042 /** 2043 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2044 * 2045 * Helper for postcopy_chunk_hostpages; it's called twice to 2046 * canonicalize the two bitmaps, that are similar, but one is 2047 * inverted. 2048 * 2049 * Postcopy requires that all target pages in a hostpage are dirty or 2050 * clean, not a mix. This function canonicalizes the bitmaps. 
2051 * 2052 * @ms: current migration state 2053 * @block: block that contains the page we want to canonicalize 2054 */ 2055 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2056 { 2057 RAMState *rs = ram_state; 2058 unsigned long *bitmap = block->bmap; 2059 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2060 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2061 unsigned long run_start; 2062 2063 if (block->page_size == TARGET_PAGE_SIZE) { 2064 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2065 return; 2066 } 2067 2068 /* Find a dirty page */ 2069 run_start = find_next_bit(bitmap, pages, 0); 2070 2071 while (run_start < pages) { 2072 2073 /* 2074 * If the start of this run of pages is in the middle of a host 2075 * page, then we need to fixup this host page. 2076 */ 2077 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2078 /* Find the end of this run */ 2079 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2080 /* 2081 * If the end isn't at the start of a host page, then the 2082 * run doesn't finish at the end of a host page 2083 * and we need to discard. 2084 */ 2085 } 2086 2087 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2088 unsigned long page; 2089 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2090 host_ratio); 2091 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2092 2093 /* Clean up the bitmap */ 2094 for (page = fixup_start_addr; 2095 page < fixup_start_addr + host_ratio; page++) { 2096 /* 2097 * Remark them as dirty, updating the count for any pages 2098 * that weren't previously dirty. 2099 */ 2100 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2101 } 2102 } 2103 2104 /* Find the next dirty page for the next iteration */ 2105 run_start = find_next_bit(bitmap, pages, run_start); 2106 } 2107 } 2108 2109 /** 2110 * postcopy_chunk_hostpages: discard any partially sent host page 2111 * 2112 * Utility for the outgoing postcopy code. 
2113 * 2114 * Discard any partially sent host-page size chunks, mark any partially 2115 * dirty host-page size chunks as all dirty. In this case the host-page 2116 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 2117 * 2118 * Returns zero on success 2119 * 2120 * @ms: current migration state 2121 * @block: block we want to work with 2122 */ 2123 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2124 { 2125 postcopy_discard_send_init(ms, block->idstr); 2126 2127 /* 2128 * Ensure that all partially dirty host pages are made fully dirty. 2129 */ 2130 postcopy_chunk_hostpages_pass(ms, block); 2131 2132 postcopy_discard_send_finish(ms); 2133 return 0; 2134 } 2135 2136 /** 2137 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2138 * 2139 * Returns zero on success 2140 * 2141 * Transmit the set of pages to be discarded after precopy to the target 2142 * these are pages that: 2143 * a) Have been previously transmitted but are now dirty again 2144 * b) Pages that have never been transmitted, this ensures that 2145 * any pages on the destination that have been mapped by background 2146 * tasks get discarded (transparent huge pages is the specific concern) 2147 * Hopefully this is pretty sparse 2148 * 2149 * @ms: current migration state 2150 */ 2151 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2152 { 2153 RAMState *rs = ram_state; 2154 RAMBlock *block; 2155 int ret; 2156 2157 RCU_READ_LOCK_GUARD(); 2158 2159 /* This should be our last sync, the src is now paused */ 2160 migration_bitmap_sync(rs); 2161 2162 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2163 rs->last_seen_block = NULL; 2164 rs->last_sent_block = NULL; 2165 rs->last_page = 0; 2166 2167 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2168 /* Deal with TPS != HPS and huge pages */ 2169 ret = postcopy_chunk_hostpages(ms, block); 2170 if (ret) { 2171 return ret; 2172 } 2173 2174 #ifdef DEBUG_POSTCOPY 2175 
ram_debug_dump_bitmap(block->bmap, true, 2176 block->used_length >> TARGET_PAGE_BITS); 2177 #endif 2178 } 2179 trace_ram_postcopy_send_discard_bitmap(); 2180 2181 return postcopy_each_ram_send_discard(ms); 2182 } 2183 2184 /** 2185 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2186 * 2187 * Returns zero on success 2188 * 2189 * @rbname: name of the RAMBlock of the request. NULL means the 2190 * same that last one. 2191 * @start: RAMBlock starting page 2192 * @length: RAMBlock size 2193 */ 2194 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2195 { 2196 trace_ram_discard_range(rbname, start, length); 2197 2198 RCU_READ_LOCK_GUARD(); 2199 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2200 2201 if (!rb) { 2202 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2203 return -1; 2204 } 2205 2206 /* 2207 * On source VM, we don't need to update the received bitmap since 2208 * we don't even have one. 2209 */ 2210 if (rb->receivedmap) { 2211 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2212 length >> qemu_target_page_bits()); 2213 } 2214 2215 return ram_block_discard_range(rb, start, length); 2216 } 2217 2218 /* 2219 * For every allocation, we will try not to crash the VM if the 2220 * allocation failed. 
2221 */ 2222 static int xbzrle_init(void) 2223 { 2224 Error *local_err = NULL; 2225 2226 if (!migrate_use_xbzrle()) { 2227 return 0; 2228 } 2229 2230 XBZRLE_cache_lock(); 2231 2232 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2233 if (!XBZRLE.zero_target_page) { 2234 error_report("%s: Error allocating zero page", __func__); 2235 goto err_out; 2236 } 2237 2238 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2239 TARGET_PAGE_SIZE, &local_err); 2240 if (!XBZRLE.cache) { 2241 error_report_err(local_err); 2242 goto free_zero_page; 2243 } 2244 2245 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2246 if (!XBZRLE.encoded_buf) { 2247 error_report("%s: Error allocating encoded_buf", __func__); 2248 goto free_cache; 2249 } 2250 2251 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2252 if (!XBZRLE.current_buf) { 2253 error_report("%s: Error allocating current_buf", __func__); 2254 goto free_encoded_buf; 2255 } 2256 2257 /* We are all good */ 2258 XBZRLE_cache_unlock(); 2259 return 0; 2260 2261 free_encoded_buf: 2262 g_free(XBZRLE.encoded_buf); 2263 XBZRLE.encoded_buf = NULL; 2264 free_cache: 2265 cache_fini(XBZRLE.cache); 2266 XBZRLE.cache = NULL; 2267 free_zero_page: 2268 g_free(XBZRLE.zero_target_page); 2269 XBZRLE.zero_target_page = NULL; 2270 err_out: 2271 XBZRLE_cache_unlock(); 2272 return -ENOMEM; 2273 } 2274 2275 static int ram_state_init(RAMState **rsp) 2276 { 2277 *rsp = g_try_new0(RAMState, 1); 2278 2279 if (!*rsp) { 2280 error_report("%s: Init ramstate fail", __func__); 2281 return -1; 2282 } 2283 2284 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2285 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2286 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2287 2288 /* 2289 * Count the total number of pages used by ram blocks not including any 2290 * gaps due to alignment or unplugs. 2291 * This must match with the initial values of dirty bitmap. 
2292 */ 2293 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2294 ram_state_reset(*rsp); 2295 2296 return 0; 2297 } 2298 2299 static void ram_list_init_bitmaps(void) 2300 { 2301 MigrationState *ms = migrate_get_current(); 2302 RAMBlock *block; 2303 unsigned long pages; 2304 uint8_t shift; 2305 2306 /* Skip setting bitmap if there is no RAM */ 2307 if (ram_bytes_total()) { 2308 shift = ms->clear_bitmap_shift; 2309 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2310 error_report("clear_bitmap_shift (%u) too big, using " 2311 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2312 shift = CLEAR_BITMAP_SHIFT_MAX; 2313 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2314 error_report("clear_bitmap_shift (%u) too small, using " 2315 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2316 shift = CLEAR_BITMAP_SHIFT_MIN; 2317 } 2318 2319 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2320 pages = block->max_length >> TARGET_PAGE_BITS; 2321 /* 2322 * The initial dirty bitmap for migration must be set with all 2323 * ones to make sure we'll migrate every guest RAM page to 2324 * destination. 2325 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2326 * new migration after a failed migration, ram_list. 2327 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2328 * guest memory. 2329 */ 2330 block->bmap = bitmap_new(pages); 2331 bitmap_set(block->bmap, 0, pages); 2332 block->clear_bmap_shift = shift; 2333 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2334 } 2335 } 2336 } 2337 2338 static void ram_init_bitmaps(RAMState *rs) 2339 { 2340 /* For memory_global_dirty_log_start below. 
*/ 2341 qemu_mutex_lock_iothread(); 2342 qemu_mutex_lock_ramlist(); 2343 2344 WITH_RCU_READ_LOCK_GUARD() { 2345 ram_list_init_bitmaps(); 2346 memory_global_dirty_log_start(); 2347 migration_bitmap_sync_precopy(rs); 2348 } 2349 qemu_mutex_unlock_ramlist(); 2350 qemu_mutex_unlock_iothread(); 2351 } 2352 2353 static int ram_init_all(RAMState **rsp) 2354 { 2355 if (ram_state_init(rsp)) { 2356 return -1; 2357 } 2358 2359 if (xbzrle_init()) { 2360 ram_state_cleanup(rsp); 2361 return -1; 2362 } 2363 2364 ram_init_bitmaps(*rsp); 2365 2366 return 0; 2367 } 2368 2369 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2370 { 2371 RAMBlock *block; 2372 uint64_t pages = 0; 2373 2374 /* 2375 * Postcopy is not using xbzrle/compression, so no need for that. 2376 * Also, since source are already halted, we don't need to care 2377 * about dirty page logging as well. 2378 */ 2379 2380 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2381 pages += bitmap_count_one(block->bmap, 2382 block->used_length >> TARGET_PAGE_BITS); 2383 } 2384 2385 /* This may not be aligned with current bitmaps. Recalculate. */ 2386 rs->migration_dirty_pages = pages; 2387 2388 rs->last_seen_block = NULL; 2389 rs->last_sent_block = NULL; 2390 rs->last_page = 0; 2391 rs->last_version = ram_list.version; 2392 /* 2393 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2394 * matter what we have sent. 2395 */ 2396 rs->ram_bulk_stage = false; 2397 2398 /* Update RAMState cache of output QEMUFile */ 2399 rs->f = out; 2400 2401 trace_ram_state_resume_prepare(pages); 2402 } 2403 2404 /* 2405 * This function clears bits of the free pages reported by the caller from the 2406 * migration dirty bitmap. @addr is the host address corresponding to the 2407 * start of the continuous guest free pages, and @len is the total bytes of 2408 * those pages. 
2409 */ 2410 void qemu_guest_free_page_hint(void *addr, size_t len) 2411 { 2412 RAMBlock *block; 2413 ram_addr_t offset; 2414 size_t used_len, start, npages; 2415 MigrationState *s = migrate_get_current(); 2416 2417 /* This function is currently expected to be used during live migration */ 2418 if (!migration_is_setup_or_active(s->state)) { 2419 return; 2420 } 2421 2422 for (; len > 0; len -= used_len, addr += used_len) { 2423 block = qemu_ram_block_from_host(addr, false, &offset); 2424 if (unlikely(!block || offset >= block->used_length)) { 2425 /* 2426 * The implementation might not support RAMBlock resize during 2427 * live migration, but it could happen in theory with future 2428 * updates. So we add a check here to capture that case. 2429 */ 2430 error_report_once("%s unexpected error", __func__); 2431 return; 2432 } 2433 2434 if (len <= block->used_length - offset) { 2435 used_len = len; 2436 } else { 2437 used_len = block->used_length - offset; 2438 } 2439 2440 start = offset >> TARGET_PAGE_BITS; 2441 npages = used_len >> TARGET_PAGE_BITS; 2442 2443 qemu_mutex_lock(&ram_state->bitmap_mutex); 2444 ram_state->migration_dirty_pages -= 2445 bitmap_count_one_with_offset(block->bmap, start, npages); 2446 bitmap_clear(block->bmap, start, npages); 2447 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2448 } 2449 } 2450 2451 /* 2452 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2453 * long-running RCU critical section. When rcu-reclaims in the code 2454 * start to become numerous it will be necessary to reduce the 2455 * granularity of these critical sections. 
2456 */ 2457 2458 /** 2459 * ram_save_setup: Setup RAM for migration 2460 * 2461 * Returns zero to indicate success and negative for error 2462 * 2463 * @f: QEMUFile where to send the data 2464 * @opaque: RAMState pointer 2465 */ 2466 static int ram_save_setup(QEMUFile *f, void *opaque) 2467 { 2468 RAMState **rsp = opaque; 2469 RAMBlock *block; 2470 2471 if (compress_threads_save_setup()) { 2472 return -1; 2473 } 2474 2475 /* migration has already setup the bitmap, reuse it. */ 2476 if (!migration_in_colo_state()) { 2477 if (ram_init_all(rsp) != 0) { 2478 compress_threads_save_cleanup(); 2479 return -1; 2480 } 2481 } 2482 (*rsp)->f = f; 2483 2484 WITH_RCU_READ_LOCK_GUARD() { 2485 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2486 2487 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2488 qemu_put_byte(f, strlen(block->idstr)); 2489 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2490 qemu_put_be64(f, block->used_length); 2491 if (migrate_postcopy_ram() && block->page_size != 2492 qemu_host_page_size) { 2493 qemu_put_be64(f, block->page_size); 2494 } 2495 if (migrate_ignore_shared()) { 2496 qemu_put_be64(f, block->mr->addr); 2497 } 2498 } 2499 } 2500 2501 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2502 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2503 2504 multifd_send_sync_main(f); 2505 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2506 qemu_fflush(f); 2507 2508 return 0; 2509 } 2510 2511 /** 2512 * ram_save_iterate: iterative stage for migration 2513 * 2514 * Returns zero to indicate success and negative for error 2515 * 2516 * @f: QEMUFile where to send the data 2517 * @opaque: RAMState pointer 2518 */ 2519 static int ram_save_iterate(QEMUFile *f, void *opaque) 2520 { 2521 RAMState **temp = opaque; 2522 RAMState *rs = *temp; 2523 int ret = 0; 2524 int i; 2525 int64_t t0; 2526 int done = 0; 2527 2528 if (blk_mig_bulk_active()) { 2529 /* Avoid transferring ram during bulk phase of block migration as 2530 * the bulk phase will 
usually take a long time and transferring 2531 * ram updates during that time is pointless. */ 2532 goto out; 2533 } 2534 2535 WITH_RCU_READ_LOCK_GUARD() { 2536 if (ram_list.version != rs->last_version) { 2537 ram_state_reset(rs); 2538 } 2539 2540 /* Read version before ram_list.blocks */ 2541 smp_rmb(); 2542 2543 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2544 2545 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2546 i = 0; 2547 while ((ret = qemu_file_rate_limit(f)) == 0 || 2548 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2549 int pages; 2550 2551 if (qemu_file_get_error(f)) { 2552 break; 2553 } 2554 2555 pages = ram_find_and_save_block(rs, false); 2556 /* no more pages to sent */ 2557 if (pages == 0) { 2558 done = 1; 2559 break; 2560 } 2561 2562 if (pages < 0) { 2563 qemu_file_set_error(f, pages); 2564 break; 2565 } 2566 2567 rs->target_page_count += pages; 2568 2569 /* 2570 * During postcopy, it is necessary to make sure one whole host 2571 * page is sent in one chunk. 2572 */ 2573 if (migrate_postcopy_ram()) { 2574 flush_compressed_data(rs); 2575 } 2576 2577 /* 2578 * we want to check in the 1st loop, just in case it was the 1st 2579 * time and we had to sync the dirty bitmap. 2580 * qemu_clock_get_ns() is a bit expensive, so we only check each 2581 * some iterations 2582 */ 2583 if ((i & 63) == 0) { 2584 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2585 1000000; 2586 if (t1 > MAX_WAIT) { 2587 trace_ram_save_iterate_big_wait(t1, i); 2588 break; 2589 } 2590 } 2591 i++; 2592 } 2593 } 2594 2595 /* 2596 * Must occur before EOS (or any QEMUFile operation) 2597 * because of RDMA protocol. 
2598 */ 2599 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2600 2601 out: 2602 if (ret >= 0 2603 && migration_is_setup_or_active(migrate_get_current()->state)) { 2604 multifd_send_sync_main(rs->f); 2605 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2606 qemu_fflush(f); 2607 ram_counters.transferred += 8; 2608 2609 ret = qemu_file_get_error(f); 2610 } 2611 if (ret < 0) { 2612 return ret; 2613 } 2614 2615 return done; 2616 } 2617 2618 /** 2619 * ram_save_complete: function called to send the remaining amount of ram 2620 * 2621 * Returns zero to indicate success or negative on error 2622 * 2623 * Called with iothread lock 2624 * 2625 * @f: QEMUFile where to send the data 2626 * @opaque: RAMState pointer 2627 */ 2628 static int ram_save_complete(QEMUFile *f, void *opaque) 2629 { 2630 RAMState **temp = opaque; 2631 RAMState *rs = *temp; 2632 int ret = 0; 2633 2634 WITH_RCU_READ_LOCK_GUARD() { 2635 if (!migration_in_postcopy()) { 2636 migration_bitmap_sync_precopy(rs); 2637 } 2638 2639 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2640 2641 /* try transferring iterative blocks of memory */ 2642 2643 /* flush all remaining blocks regardless of rate limiting */ 2644 while (true) { 2645 int pages; 2646 2647 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2648 /* no more blocks to sent */ 2649 if (pages == 0) { 2650 break; 2651 } 2652 if (pages < 0) { 2653 ret = pages; 2654 break; 2655 } 2656 } 2657 2658 flush_compressed_data(rs); 2659 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2660 } 2661 2662 if (ret >= 0) { 2663 multifd_send_sync_main(rs->f); 2664 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2665 qemu_fflush(f); 2666 } 2667 2668 return ret; 2669 } 2670 2671 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2672 uint64_t *res_precopy_only, 2673 uint64_t *res_compatible, 2674 uint64_t *res_postcopy_only) 2675 { 2676 RAMState **temp = opaque; 2677 RAMState *rs = *temp; 2678 uint64_t remaining_size; 2679 2680 remaining_size = 
rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2681 2682 if (!migration_in_postcopy() && 2683 remaining_size < max_size) { 2684 qemu_mutex_lock_iothread(); 2685 WITH_RCU_READ_LOCK_GUARD() { 2686 migration_bitmap_sync_precopy(rs); 2687 } 2688 qemu_mutex_unlock_iothread(); 2689 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2690 } 2691 2692 if (migrate_postcopy_ram()) { 2693 /* We can do postcopy, and all the data is postcopiable */ 2694 *res_compatible += remaining_size; 2695 } else { 2696 *res_precopy_only += remaining_size; 2697 } 2698 } 2699 2700 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2701 { 2702 unsigned int xh_len; 2703 int xh_flags; 2704 uint8_t *loaded_data; 2705 2706 /* extract RLE header */ 2707 xh_flags = qemu_get_byte(f); 2708 xh_len = qemu_get_be16(f); 2709 2710 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2711 error_report("Failed to load XBZRLE page - wrong compression!"); 2712 return -1; 2713 } 2714 2715 if (xh_len > TARGET_PAGE_SIZE) { 2716 error_report("Failed to load XBZRLE page - len overflow!"); 2717 return -1; 2718 } 2719 loaded_data = XBZRLE.decoded_buf; 2720 /* load data and decode */ 2721 /* it can change loaded_data to point to an internal buffer */ 2722 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 2723 2724 /* decode RLE */ 2725 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 2726 TARGET_PAGE_SIZE) == -1) { 2727 error_report("Failed to load XBZRLE page - decode error!"); 2728 return -1; 2729 } 2730 2731 return 0; 2732 } 2733 2734 /** 2735 * ram_block_from_stream: read a RAMBlock id from the migration stream 2736 * 2737 * Must be called from within a rcu critical section. 2738 * 2739 * Returns a pointer from within the RCU-protected ram_list. 
2740 * 2741 * @f: QEMUFile where to read the data from 2742 * @flags: Page flags (mostly to see if it's a continuation of previous block) 2743 */ 2744 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 2745 { 2746 static RAMBlock *block; 2747 char id[256]; 2748 uint8_t len; 2749 2750 if (flags & RAM_SAVE_FLAG_CONTINUE) { 2751 if (!block) { 2752 error_report("Ack, bad migration stream!"); 2753 return NULL; 2754 } 2755 return block; 2756 } 2757 2758 len = qemu_get_byte(f); 2759 qemu_get_buffer(f, (uint8_t *)id, len); 2760 id[len] = 0; 2761 2762 block = qemu_ram_block_by_name(id); 2763 if (!block) { 2764 error_report("Can't find block %s", id); 2765 return NULL; 2766 } 2767 2768 if (ramblock_is_ignored(block)) { 2769 error_report("block %s should not be migrated !", id); 2770 return NULL; 2771 } 2772 2773 return block; 2774 } 2775 2776 static inline void *host_from_ram_block_offset(RAMBlock *block, 2777 ram_addr_t offset) 2778 { 2779 if (!offset_in_ramblock(block, offset)) { 2780 return NULL; 2781 } 2782 2783 return block->host + offset; 2784 } 2785 2786 static inline void *colo_cache_from_block_offset(RAMBlock *block, 2787 ram_addr_t offset, bool record_bitmap) 2788 { 2789 if (!offset_in_ramblock(block, offset)) { 2790 return NULL; 2791 } 2792 if (!block->colo_cache) { 2793 error_report("%s: colo_cache is NULL in block :%s", 2794 __func__, block->idstr); 2795 return NULL; 2796 } 2797 2798 /* 2799 * During colo checkpoint, we need bitmap of these migrated pages. 2800 * It help us to decide which pages in ram cache should be flushed 2801 * into VM's RAM later. 2802 */ 2803 if (record_bitmap && 2804 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 2805 ram_state->migration_dirty_pages++; 2806 } 2807 return block->colo_cache + offset; 2808 } 2809 2810 /** 2811 * ram_handle_compressed: handle the zero page case 2812 * 2813 * If a page (or a whole RDMA chunk) has been 2814 * determined to be zero, then zap it. 
2815 * 2816 * @host: host address for the zero page 2817 * @ch: what the page is filled from. We only support zero 2818 * @size: size of the zero page 2819 */ 2820 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 2821 { 2822 if (ch != 0 || !is_zero_range(host, size)) { 2823 memset(host, ch, size); 2824 } 2825 } 2826 2827 /* return the size after decompression, or negative value on error */ 2828 static int 2829 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 2830 const uint8_t *source, size_t source_len) 2831 { 2832 int err; 2833 2834 err = inflateReset(stream); 2835 if (err != Z_OK) { 2836 return -1; 2837 } 2838 2839 stream->avail_in = source_len; 2840 stream->next_in = (uint8_t *)source; 2841 stream->avail_out = dest_len; 2842 stream->next_out = dest; 2843 2844 err = inflate(stream, Z_NO_FLUSH); 2845 if (err != Z_STREAM_END) { 2846 return -1; 2847 } 2848 2849 return stream->total_out; 2850 } 2851 2852 static void *do_data_decompress(void *opaque) 2853 { 2854 DecompressParam *param = opaque; 2855 unsigned long pagesize; 2856 uint8_t *des; 2857 int len, ret; 2858 2859 qemu_mutex_lock(¶m->mutex); 2860 while (!param->quit) { 2861 if (param->des) { 2862 des = param->des; 2863 len = param->len; 2864 param->des = 0; 2865 qemu_mutex_unlock(¶m->mutex); 2866 2867 pagesize = TARGET_PAGE_SIZE; 2868 2869 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 2870 param->compbuf, len); 2871 if (ret < 0 && migrate_get_current()->decompress_error_check) { 2872 error_report("decompress data failed"); 2873 qemu_file_set_error(decomp_file, ret); 2874 } 2875 2876 qemu_mutex_lock(&decomp_done_lock); 2877 param->done = true; 2878 qemu_cond_signal(&decomp_done_cond); 2879 qemu_mutex_unlock(&decomp_done_lock); 2880 2881 qemu_mutex_lock(¶m->mutex); 2882 } else { 2883 qemu_cond_wait(¶m->cond, ¶m->mutex); 2884 } 2885 } 2886 qemu_mutex_unlock(¶m->mutex); 2887 2888 return NULL; 2889 } 2890 2891 static int wait_for_decompress_done(void) 2892 { 2893 int 
idx, thread_count; 2894 2895 if (!migrate_use_compression()) { 2896 return 0; 2897 } 2898 2899 thread_count = migrate_decompress_threads(); 2900 qemu_mutex_lock(&decomp_done_lock); 2901 for (idx = 0; idx < thread_count; idx++) { 2902 while (!decomp_param[idx].done) { 2903 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 2904 } 2905 } 2906 qemu_mutex_unlock(&decomp_done_lock); 2907 return qemu_file_get_error(decomp_file); 2908 } 2909 2910 static void compress_threads_load_cleanup(void) 2911 { 2912 int i, thread_count; 2913 2914 if (!migrate_use_compression()) { 2915 return; 2916 } 2917 thread_count = migrate_decompress_threads(); 2918 for (i = 0; i < thread_count; i++) { 2919 /* 2920 * we use it as a indicator which shows if the thread is 2921 * properly init'd or not 2922 */ 2923 if (!decomp_param[i].compbuf) { 2924 break; 2925 } 2926 2927 qemu_mutex_lock(&decomp_param[i].mutex); 2928 decomp_param[i].quit = true; 2929 qemu_cond_signal(&decomp_param[i].cond); 2930 qemu_mutex_unlock(&decomp_param[i].mutex); 2931 } 2932 for (i = 0; i < thread_count; i++) { 2933 if (!decomp_param[i].compbuf) { 2934 break; 2935 } 2936 2937 qemu_thread_join(decompress_threads + i); 2938 qemu_mutex_destroy(&decomp_param[i].mutex); 2939 qemu_cond_destroy(&decomp_param[i].cond); 2940 inflateEnd(&decomp_param[i].stream); 2941 g_free(decomp_param[i].compbuf); 2942 decomp_param[i].compbuf = NULL; 2943 } 2944 g_free(decompress_threads); 2945 g_free(decomp_param); 2946 decompress_threads = NULL; 2947 decomp_param = NULL; 2948 decomp_file = NULL; 2949 } 2950 2951 static int compress_threads_load_setup(QEMUFile *f) 2952 { 2953 int i, thread_count; 2954 2955 if (!migrate_use_compression()) { 2956 return 0; 2957 } 2958 2959 thread_count = migrate_decompress_threads(); 2960 decompress_threads = g_new0(QemuThread, thread_count); 2961 decomp_param = g_new0(DecompressParam, thread_count); 2962 qemu_mutex_init(&decomp_done_lock); 2963 qemu_cond_init(&decomp_done_cond); 2964 decomp_file = f; 2965 for 
(i = 0; i < thread_count; i++) { 2966 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 2967 goto exit; 2968 } 2969 2970 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 2971 qemu_mutex_init(&decomp_param[i].mutex); 2972 qemu_cond_init(&decomp_param[i].cond); 2973 decomp_param[i].done = true; 2974 decomp_param[i].quit = false; 2975 qemu_thread_create(decompress_threads + i, "decompress", 2976 do_data_decompress, decomp_param + i, 2977 QEMU_THREAD_JOINABLE); 2978 } 2979 return 0; 2980 exit: 2981 compress_threads_load_cleanup(); 2982 return -1; 2983 } 2984 2985 static void decompress_data_with_multi_threads(QEMUFile *f, 2986 void *host, int len) 2987 { 2988 int idx, thread_count; 2989 2990 thread_count = migrate_decompress_threads(); 2991 qemu_mutex_lock(&decomp_done_lock); 2992 while (true) { 2993 for (idx = 0; idx < thread_count; idx++) { 2994 if (decomp_param[idx].done) { 2995 decomp_param[idx].done = false; 2996 qemu_mutex_lock(&decomp_param[idx].mutex); 2997 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 2998 decomp_param[idx].des = host; 2999 decomp_param[idx].len = len; 3000 qemu_cond_signal(&decomp_param[idx].cond); 3001 qemu_mutex_unlock(&decomp_param[idx].mutex); 3002 break; 3003 } 3004 } 3005 if (idx < thread_count) { 3006 break; 3007 } else { 3008 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3009 } 3010 } 3011 qemu_mutex_unlock(&decomp_done_lock); 3012 } 3013 3014 /* 3015 * colo cache: this is for secondary VM, we cache the whole 3016 * memory of the secondary VM, it is need to hold the global lock 3017 * to call this helper. 
3018 */ 3019 int colo_init_ram_cache(void) 3020 { 3021 RAMBlock *block; 3022 3023 WITH_RCU_READ_LOCK_GUARD() { 3024 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3025 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3026 NULL, 3027 false); 3028 if (!block->colo_cache) { 3029 error_report("%s: Can't alloc memory for COLO cache of block %s," 3030 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3031 block->used_length); 3032 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3033 if (block->colo_cache) { 3034 qemu_anon_ram_free(block->colo_cache, block->used_length); 3035 block->colo_cache = NULL; 3036 } 3037 } 3038 return -errno; 3039 } 3040 } 3041 } 3042 3043 /* 3044 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3045 * with to decide which page in cache should be flushed into SVM's RAM. Here 3046 * we use the same name 'ram_bitmap' as for migration. 3047 */ 3048 if (ram_bytes_total()) { 3049 RAMBlock *block; 3050 3051 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3052 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3053 block->bmap = bitmap_new(pages); 3054 } 3055 } 3056 3057 ram_state_init(&ram_state); 3058 return 0; 3059 } 3060 3061 /* TODO: duplicated with ram_init_bitmaps */ 3062 void colo_incoming_start_dirty_log(void) 3063 { 3064 RAMBlock *block = NULL; 3065 /* For memory_global_dirty_log_start below. 
*/ 3066 qemu_mutex_lock_iothread(); 3067 qemu_mutex_lock_ramlist(); 3068 3069 memory_global_dirty_log_sync(); 3070 WITH_RCU_READ_LOCK_GUARD() { 3071 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3072 ramblock_sync_dirty_bitmap(ram_state, block); 3073 /* Discard this dirty bitmap record */ 3074 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3075 } 3076 memory_global_dirty_log_start(); 3077 } 3078 ram_state->migration_dirty_pages = 0; 3079 qemu_mutex_unlock_ramlist(); 3080 qemu_mutex_unlock_iothread(); 3081 } 3082 3083 /* It is need to hold the global lock to call this helper */ 3084 void colo_release_ram_cache(void) 3085 { 3086 RAMBlock *block; 3087 3088 memory_global_dirty_log_stop(); 3089 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3090 g_free(block->bmap); 3091 block->bmap = NULL; 3092 } 3093 3094 WITH_RCU_READ_LOCK_GUARD() { 3095 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3096 if (block->colo_cache) { 3097 qemu_anon_ram_free(block->colo_cache, block->used_length); 3098 block->colo_cache = NULL; 3099 } 3100 } 3101 } 3102 ram_state_cleanup(&ram_state); 3103 } 3104 3105 /** 3106 * ram_load_setup: Setup RAM for migration incoming side 3107 * 3108 * Returns zero to indicate success and negative for error 3109 * 3110 * @f: QEMUFile where to receive the data 3111 * @opaque: RAMState pointer 3112 */ 3113 static int ram_load_setup(QEMUFile *f, void *opaque) 3114 { 3115 if (compress_threads_load_setup(f)) { 3116 return -1; 3117 } 3118 3119 xbzrle_load_setup(); 3120 ramblock_recv_map_init(); 3121 3122 return 0; 3123 } 3124 3125 static int ram_load_cleanup(void *opaque) 3126 { 3127 RAMBlock *rb; 3128 3129 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3130 qemu_ram_block_writeback(rb); 3131 } 3132 3133 xbzrle_load_cleanup(); 3134 compress_threads_load_cleanup(); 3135 3136 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3137 g_free(rb->receivedmap); 3138 rb->receivedmap = NULL; 3139 } 3140 3141 return 0; 3142 } 3143 3144 /** 3145 * ram_postcopy_incoming_init: allocate postcopy data structures 3146 * 
3147 * Returns 0 for success and negative if there was one error 3148 * 3149 * @mis: current migration incoming state 3150 * 3151 * Allocate data structures etc needed by incoming migration with 3152 * postcopy-ram. postcopy-ram's similarly names 3153 * postcopy_ram_incoming_init does the work. 3154 */ 3155 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3156 { 3157 return postcopy_ram_incoming_init(mis); 3158 } 3159 3160 /** 3161 * ram_load_postcopy: load a page in postcopy case 3162 * 3163 * Returns 0 for success or -errno in case of error 3164 * 3165 * Called in postcopy mode by ram_load(). 3166 * rcu_read_lock is taken prior to this being called. 3167 * 3168 * @f: QEMUFile where to send the data 3169 */ 3170 static int ram_load_postcopy(QEMUFile *f) 3171 { 3172 int flags = 0, ret = 0; 3173 bool place_needed = false; 3174 bool matches_target_page_size = false; 3175 MigrationIncomingState *mis = migration_incoming_get_current(); 3176 /* Temporary page that is later 'placed' */ 3177 void *postcopy_host_page = mis->postcopy_tmp_page; 3178 void *this_host = NULL; 3179 bool all_zero = true; 3180 int target_pages = 0; 3181 3182 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3183 ram_addr_t addr; 3184 void *host = NULL; 3185 void *page_buffer = NULL; 3186 void *place_source = NULL; 3187 RAMBlock *block = NULL; 3188 uint8_t ch; 3189 int len; 3190 3191 addr = qemu_get_be64(f); 3192 3193 /* 3194 * If qemu file error, we should stop here, and then "addr" 3195 * may be invalid 3196 */ 3197 ret = qemu_file_get_error(f); 3198 if (ret) { 3199 break; 3200 } 3201 3202 flags = addr & ~TARGET_PAGE_MASK; 3203 addr &= TARGET_PAGE_MASK; 3204 3205 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3206 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3207 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3208 block = ram_block_from_stream(f, flags); 3209 3210 host = host_from_ram_block_offset(block, addr); 3211 if (!host) { 3212 error_report("Illegal RAM offset " RAM_ADDR_FMT, 
addr); 3213 ret = -EINVAL; 3214 break; 3215 } 3216 target_pages++; 3217 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3218 /* 3219 * Postcopy requires that we place whole host pages atomically; 3220 * these may be huge pages for RAMBlocks that are backed by 3221 * hugetlbfs. 3222 * To make it atomic, the data is read into a temporary page 3223 * that's moved into place later. 3224 * The migration protocol uses, possibly smaller, target-pages 3225 * however the source ensures it always sends all the components 3226 * of a host page in one chunk. 3227 */ 3228 page_buffer = postcopy_host_page + 3229 ((uintptr_t)host & (block->page_size - 1)); 3230 if (target_pages == 1) { 3231 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3232 block->page_size); 3233 } else { 3234 /* not the 1st TP within the HP */ 3235 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != 3236 (uintptr_t)this_host) { 3237 error_report("Non-same host page %p/%p", 3238 host, this_host); 3239 ret = -EINVAL; 3240 break; 3241 } 3242 } 3243 3244 /* 3245 * If it's the last part of a host page then we place the host 3246 * page 3247 */ 3248 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3249 place_needed = true; 3250 } 3251 place_source = postcopy_host_page; 3252 } 3253 3254 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3255 case RAM_SAVE_FLAG_ZERO: 3256 ch = qemu_get_byte(f); 3257 /* 3258 * Can skip to set page_buffer when 3259 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3260 */ 3261 if (ch || !matches_target_page_size) { 3262 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3263 } 3264 if (ch) { 3265 all_zero = false; 3266 } 3267 break; 3268 3269 case RAM_SAVE_FLAG_PAGE: 3270 all_zero = false; 3271 if (!matches_target_page_size) { 3272 /* For huge pages, we always use temporary buffer */ 3273 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3274 } else { 3275 /* 3276 * For small pages that matches target page size, we 3277 * avoid the qemu_file copy. Instead we directly use 3278 * the buffer of QEMUFile to place the page. Note: we 3279 * cannot do any QEMUFile operation before using that 3280 * buffer to make sure the buffer is valid when 3281 * placing the page. 3282 */ 3283 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3284 TARGET_PAGE_SIZE); 3285 } 3286 break; 3287 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3288 all_zero = false; 3289 len = qemu_get_be32(f); 3290 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3291 error_report("Invalid compressed data length: %d", len); 3292 ret = -EINVAL; 3293 break; 3294 } 3295 decompress_data_with_multi_threads(f, page_buffer, len); 3296 break; 3297 3298 case RAM_SAVE_FLAG_EOS: 3299 /* normal exit */ 3300 multifd_recv_sync_main(); 3301 break; 3302 default: 3303 error_report("Unknown combination of migration flags: 0x%x" 3304 " (postcopy mode)", flags); 3305 ret = -EINVAL; 3306 break; 3307 } 3308 3309 /* Got the whole host page, wait for decompress before placing. 
*/ 3310 if (place_needed) { 3311 ret |= wait_for_decompress_done(); 3312 } 3313 3314 /* Detect for any possible file errors */ 3315 if (!ret && qemu_file_get_error(f)) { 3316 ret = qemu_file_get_error(f); 3317 } 3318 3319 if (!ret && place_needed) { 3320 /* This gets called at the last target page in the host page */ 3321 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3322 block->page_size); 3323 3324 if (all_zero) { 3325 ret = postcopy_place_page_zero(mis, place_dest, 3326 block); 3327 } else { 3328 ret = postcopy_place_page(mis, place_dest, 3329 place_source, block); 3330 } 3331 place_needed = false; 3332 target_pages = 0; 3333 /* Assume we have a zero page until we detect something different */ 3334 all_zero = true; 3335 } 3336 } 3337 3338 return ret; 3339 } 3340 3341 static bool postcopy_is_advised(void) 3342 { 3343 PostcopyState ps = postcopy_state_get(); 3344 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3345 } 3346 3347 static bool postcopy_is_running(void) 3348 { 3349 PostcopyState ps = postcopy_state_get(); 3350 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3351 } 3352 3353 /* 3354 * Flush content of RAM cache into SVM's memory. 3355 * Only flush the pages that be dirtied by PVM or SVM or both. 
3356 */ 3357 void colo_flush_ram_cache(void) 3358 { 3359 RAMBlock *block = NULL; 3360 void *dst_host; 3361 void *src_host; 3362 unsigned long offset = 0; 3363 3364 memory_global_dirty_log_sync(); 3365 WITH_RCU_READ_LOCK_GUARD() { 3366 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3367 ramblock_sync_dirty_bitmap(ram_state, block); 3368 } 3369 } 3370 3371 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3372 WITH_RCU_READ_LOCK_GUARD() { 3373 block = QLIST_FIRST_RCU(&ram_list.blocks); 3374 3375 while (block) { 3376 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3377 3378 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3379 >= block->used_length) { 3380 offset = 0; 3381 block = QLIST_NEXT_RCU(block, next); 3382 } else { 3383 migration_bitmap_clear_dirty(ram_state, block, offset); 3384 dst_host = block->host 3385 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3386 src_host = block->colo_cache 3387 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3388 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3389 } 3390 } 3391 } 3392 trace_colo_flush_ram_cache_end(); 3393 } 3394 3395 /** 3396 * ram_load_precopy: load pages in precopy case 3397 * 3398 * Returns 0 for success or -errno in case of error 3399 * 3400 * Called in precopy mode by ram_load(). 3401 * rcu_read_lock is taken prior to this being called. 
3402 * 3403 * @f: QEMUFile where to send the data 3404 */ 3405 static int ram_load_precopy(QEMUFile *f) 3406 { 3407 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3408 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3409 bool postcopy_advised = postcopy_is_advised(); 3410 if (!migrate_use_compression()) { 3411 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3412 } 3413 3414 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3415 ram_addr_t addr, total_ram_bytes; 3416 void *host = NULL, *host_bak = NULL; 3417 uint8_t ch; 3418 3419 /* 3420 * Yield periodically to let main loop run, but an iteration of 3421 * the main loop is expensive, so do it each some iterations 3422 */ 3423 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3424 aio_co_schedule(qemu_get_current_aio_context(), 3425 qemu_coroutine_self()); 3426 qemu_coroutine_yield(); 3427 } 3428 i++; 3429 3430 addr = qemu_get_be64(f); 3431 flags = addr & ~TARGET_PAGE_MASK; 3432 addr &= TARGET_PAGE_MASK; 3433 3434 if (flags & invalid_flags) { 3435 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3436 error_report("Received an unexpected compressed page"); 3437 } 3438 3439 ret = -EINVAL; 3440 break; 3441 } 3442 3443 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3444 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3445 RAMBlock *block = ram_block_from_stream(f, flags); 3446 3447 host = host_from_ram_block_offset(block, addr); 3448 /* 3449 * After going into COLO stage, we should not load the page 3450 * into SVM's memory directly, we put them into colo_cache firstly. 3451 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3452 * Previously, we copied all these memory in preparing stage of COLO 3453 * while we need to stop VM, which is a time-consuming process. 
3454 * Here we optimize it by a trick, back-up every page while in 3455 * migration process while COLO is enabled, though it affects the 3456 * speed of the migration, but it obviously reduce the downtime of 3457 * back-up all SVM'S memory in COLO preparing stage. 3458 */ 3459 if (migration_incoming_colo_enabled()) { 3460 if (migration_incoming_in_colo_state()) { 3461 /* In COLO stage, put all pages into cache temporarily */ 3462 host = colo_cache_from_block_offset(block, addr, true); 3463 } else { 3464 /* 3465 * In migration stage but before COLO stage, 3466 * Put all pages into both cache and SVM's memory. 3467 */ 3468 host_bak = colo_cache_from_block_offset(block, addr, false); 3469 } 3470 } 3471 if (!host) { 3472 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3473 ret = -EINVAL; 3474 break; 3475 } 3476 if (!migration_incoming_in_colo_state()) { 3477 ramblock_recv_bitmap_set(block, host); 3478 } 3479 3480 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3481 } 3482 3483 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3484 case RAM_SAVE_FLAG_MEM_SIZE: 3485 /* Synchronize RAM block list */ 3486 total_ram_bytes = addr; 3487 while (!ret && total_ram_bytes) { 3488 RAMBlock *block; 3489 char id[256]; 3490 ram_addr_t length; 3491 3492 len = qemu_get_byte(f); 3493 qemu_get_buffer(f, (uint8_t *)id, len); 3494 id[len] = 0; 3495 length = qemu_get_be64(f); 3496 3497 block = qemu_ram_block_by_name(id); 3498 if (block && !qemu_ram_is_migratable(block)) { 3499 error_report("block %s should not be migrated !", id); 3500 ret = -EINVAL; 3501 } else if (block) { 3502 if (length != block->used_length) { 3503 Error *local_err = NULL; 3504 3505 ret = qemu_ram_resize(block, length, 3506 &local_err); 3507 if (local_err) { 3508 error_report_err(local_err); 3509 } 3510 } 3511 /* For postcopy we need to check hugepage sizes match */ 3512 if (postcopy_advised && 3513 block->page_size != qemu_host_page_size) { 3514 uint64_t remote_page_size = qemu_get_be64(f); 3515 if 
(remote_page_size != block->page_size) { 3516 error_report("Mismatched RAM page size %s " 3517 "(local) %zd != %" PRId64, 3518 id, block->page_size, 3519 remote_page_size); 3520 ret = -EINVAL; 3521 } 3522 } 3523 if (migrate_ignore_shared()) { 3524 hwaddr addr = qemu_get_be64(f); 3525 if (ramblock_is_ignored(block) && 3526 block->mr->addr != addr) { 3527 error_report("Mismatched GPAs for block %s " 3528 "%" PRId64 "!= %" PRId64, 3529 id, (uint64_t)addr, 3530 (uint64_t)block->mr->addr); 3531 ret = -EINVAL; 3532 } 3533 } 3534 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3535 block->idstr); 3536 } else { 3537 error_report("Unknown ramblock \"%s\", cannot " 3538 "accept migration", id); 3539 ret = -EINVAL; 3540 } 3541 3542 total_ram_bytes -= length; 3543 } 3544 break; 3545 3546 case RAM_SAVE_FLAG_ZERO: 3547 ch = qemu_get_byte(f); 3548 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3549 break; 3550 3551 case RAM_SAVE_FLAG_PAGE: 3552 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3553 break; 3554 3555 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3556 len = qemu_get_be32(f); 3557 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3558 error_report("Invalid compressed data length: %d", len); 3559 ret = -EINVAL; 3560 break; 3561 } 3562 decompress_data_with_multi_threads(f, host, len); 3563 break; 3564 3565 case RAM_SAVE_FLAG_XBZRLE: 3566 if (load_xbzrle(f, addr, host) < 0) { 3567 error_report("Failed to decompress XBZRLE page at " 3568 RAM_ADDR_FMT, addr); 3569 ret = -EINVAL; 3570 break; 3571 } 3572 break; 3573 case RAM_SAVE_FLAG_EOS: 3574 /* normal exit */ 3575 multifd_recv_sync_main(); 3576 break; 3577 default: 3578 if (flags & RAM_SAVE_FLAG_HOOK) { 3579 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3580 } else { 3581 error_report("Unknown combination of migration flags: 0x%x", 3582 flags); 3583 ret = -EINVAL; 3584 } 3585 } 3586 if (!ret) { 3587 ret = qemu_file_get_error(f); 3588 } 3589 if (!ret && host_bak) { 3590 memcpy(host_bak, host, TARGET_PAGE_SIZE); 3591 } 3592 } 
3593 3594 ret |= wait_for_decompress_done(); 3595 return ret; 3596 } 3597 3598 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3599 { 3600 int ret = 0; 3601 static uint64_t seq_iter; 3602 /* 3603 * If system is running in postcopy mode, page inserts to host memory must 3604 * be atomic 3605 */ 3606 bool postcopy_running = postcopy_is_running(); 3607 3608 seq_iter++; 3609 3610 if (version_id != 4) { 3611 return -EINVAL; 3612 } 3613 3614 /* 3615 * This RCU critical section can be very long running. 3616 * When RCU reclaims in the code start to become numerous, 3617 * it will be necessary to reduce the granularity of this 3618 * critical section. 3619 */ 3620 WITH_RCU_READ_LOCK_GUARD() { 3621 if (postcopy_running) { 3622 ret = ram_load_postcopy(f); 3623 } else { 3624 ret = ram_load_precopy(f); 3625 } 3626 } 3627 trace_ram_load_complete(ret, seq_iter); 3628 3629 return ret; 3630 } 3631 3632 static bool ram_has_postcopy(void *opaque) 3633 { 3634 RAMBlock *rb; 3635 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3636 if (ramblock_is_pmem(rb)) { 3637 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 3638 "is not supported now!", rb->idstr, rb->host); 3639 return false; 3640 } 3641 } 3642 3643 return migrate_postcopy_ram(); 3644 } 3645 3646 /* Sync all the dirty bitmap with destination VM. 
*/ 3647 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3648 { 3649 RAMBlock *block; 3650 QEMUFile *file = s->to_dst_file; 3651 int ramblock_count = 0; 3652 3653 trace_ram_dirty_bitmap_sync_start(); 3654 3655 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3656 qemu_savevm_send_recv_bitmap(file, block->idstr); 3657 trace_ram_dirty_bitmap_request(block->idstr); 3658 ramblock_count++; 3659 } 3660 3661 trace_ram_dirty_bitmap_sync_wait(); 3662 3663 /* Wait until all the ramblocks' dirty bitmap synced */ 3664 while (ramblock_count--) { 3665 qemu_sem_wait(&s->rp_state.rp_sem); 3666 } 3667 3668 trace_ram_dirty_bitmap_sync_complete(); 3669 3670 return 0; 3671 } 3672 3673 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3674 { 3675 qemu_sem_post(&s->rp_state.rp_sem); 3676 } 3677 3678 /* 3679 * Read the received bitmap, revert it as the initial dirty bitmap. 3680 * This is only used when the postcopy migration is paused but wants 3681 * to resume from a middle point. 3682 */ 3683 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3684 { 3685 int ret = -EINVAL; 3686 QEMUFile *file = s->rp_state.from_dst_file; 3687 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3688 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 3689 uint64_t size, end_mark; 3690 3691 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3692 3693 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3694 error_report("%s: incorrect state %s", __func__, 3695 MigrationStatus_str(s->state)); 3696 return -EINVAL; 3697 } 3698 3699 /* 3700 * Note: see comments in ramblock_recv_bitmap_send() on why we 3701 * need the endianness conversion, and the paddings. 
3702 */ 3703 local_size = ROUND_UP(local_size, 8); 3704 3705 /* Add paddings */ 3706 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 3707 3708 size = qemu_get_be64(file); 3709 3710 /* The size of the bitmap should match with our ramblock */ 3711 if (size != local_size) { 3712 error_report("%s: ramblock '%s' bitmap size mismatch " 3713 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 3714 block->idstr, size, local_size); 3715 ret = -EINVAL; 3716 goto out; 3717 } 3718 3719 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 3720 end_mark = qemu_get_be64(file); 3721 3722 ret = qemu_file_get_error(file); 3723 if (ret || size != local_size) { 3724 error_report("%s: read bitmap failed for ramblock '%s': %d" 3725 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 3726 __func__, block->idstr, ret, local_size, size); 3727 ret = -EIO; 3728 goto out; 3729 } 3730 3731 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 3732 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64, 3733 __func__, block->idstr, end_mark); 3734 ret = -EINVAL; 3735 goto out; 3736 } 3737 3738 /* 3739 * Endianness conversion. We are during postcopy (though paused). 3740 * The dirty bitmap won't change. We can directly modify it. 3741 */ 3742 bitmap_from_le(block->bmap, le_bitmap, nbits); 3743 3744 /* 3745 * What we received is "received bitmap". Revert it as the initial 3746 * dirty bitmap for this ramblock. 3747 */ 3748 bitmap_complement(block->bmap, block->bmap, nbits); 3749 3750 trace_ram_dirty_bitmap_reload_complete(block->idstr); 3751 3752 /* 3753 * We succeeded to sync bitmap for current ramblock. If this is 3754 * the last one to sync, we need to notify the main send thread. 
3755 */ 3756 ram_dirty_bitmap_reload_notify(s); 3757 3758 ret = 0; 3759 out: 3760 g_free(le_bitmap); 3761 return ret; 3762 } 3763 3764 static int ram_resume_prepare(MigrationState *s, void *opaque) 3765 { 3766 RAMState *rs = *(RAMState **)opaque; 3767 int ret; 3768 3769 ret = ram_dirty_bitmap_sync_all(s, rs); 3770 if (ret) { 3771 return ret; 3772 } 3773 3774 ram_state_resume_prepare(rs, s->to_dst_file); 3775 3776 return 0; 3777 } 3778 3779 static SaveVMHandlers savevm_ram_handlers = { 3780 .save_setup = ram_save_setup, 3781 .save_live_iterate = ram_save_iterate, 3782 .save_live_complete_postcopy = ram_save_complete, 3783 .save_live_complete_precopy = ram_save_complete, 3784 .has_postcopy = ram_has_postcopy, 3785 .save_live_pending = ram_save_pending, 3786 .load_state = ram_load, 3787 .save_cleanup = ram_save_cleanup, 3788 .load_setup = ram_load_setup, 3789 .load_cleanup = ram_load_cleanup, 3790 .resume_prepare = ram_resume_prepare, 3791 }; 3792 3793 void ram_mig_init(void) 3794 { 3795 qemu_mutex_init(&XBZRLE.lock); 3796 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 3797 } 3798