1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "cpu.h" 31 #include "qemu/cutils.h" 32 #include "qemu/bitops.h" 33 #include "qemu/bitmap.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram.h" 37 #include "migration.h" 38 #include "migration/register.h" 39 #include "migration/misc.h" 40 #include "qemu-file.h" 41 #include "postcopy-ram.h" 42 #include "page_cache.h" 43 #include "qemu/error-report.h" 44 #include "qapi/error.h" 45 #include "qapi/qapi-types-migration.h" 46 #include "qapi/qapi-events-migration.h" 47 #include "qapi/qmp/qerror.h" 48 #include "trace.h" 49 #include "exec/ram_addr.h" 50 #include "exec/target_page.h" 51 #include "qemu/rcu_queue.h" 52 #include "migration/colo.h" 53 #include "block.h" 54 #include "sysemu/sysemu.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 60 /***********************************************************/ 61 /* ram save/restore */ 62 63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 64 * worked for pages that where filled with the same char. We switched 65 * it to only search for the zero value. And to avoid confusion with 66 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it. 67 */ 68 69 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 70 #define RAM_SAVE_FLAG_ZERO 0x02 71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 72 #define RAM_SAVE_FLAG_PAGE 0x08 73 #define RAM_SAVE_FLAG_EOS 0x10 74 #define RAM_SAVE_FLAG_CONTINUE 0x20 75 #define RAM_SAVE_FLAG_XBZRLE 0x40 76 /* 0x80 is reserved in migration.h start with 0x100 next */ 77 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 78 79 static inline bool is_zero_range(uint8_t *p, uint64_t size) 80 { 81 return buffer_is_zero(p, size); 82 } 83 84 XBZRLECacheStats xbzrle_counters; 85 86 /* struct contains XBZRLE cache and a static page 87 used by the compression */ 88 static struct { 89 /* buffer used for XBZRLE encoding */ 90 uint8_t *encoded_buf; 91 /* buffer for storing page content */ 92 uint8_t *current_buf; 93 /* Cache for XBZRLE, Protected by lock. */ 94 PageCache *cache; 95 QemuMutex lock; 96 /* it will store a page full of zeros */ 97 uint8_t *zero_target_page; 98 /* buffer used for XBZRLE decoding */ 99 uint8_t *decoded_buf; 100 } XBZRLE; 101 102 static void XBZRLE_cache_lock(void) 103 { 104 if (migrate_use_xbzrle()) 105 qemu_mutex_lock(&XBZRLE.lock); 106 } 107 108 static void XBZRLE_cache_unlock(void) 109 { 110 if (migrate_use_xbzrle()) 111 qemu_mutex_unlock(&XBZRLE.lock); 112 } 113 114 /** 115 * xbzrle_cache_resize: resize the xbzrle cache 116 * 117 * This function is called from qmp_migrate_set_cache_size in main 118 * thread, possibly while a migration is in progress. A running 119 * migration may be using the cache and might finish during this call, 120 * hence changes to the cache are protected by XBZRLE.lock(). 121 * 122 * Returns 0 for success or -1 for error 123 * 124 * @new_size: new cache size 125 * @errp: set *errp if the check failed, with reason 126 */ 127 int xbzrle_cache_resize(int64_t new_size, Error **errp) 128 { 129 PageCache *new_cache; 130 int64_t ret = 0; 131 132 /* Check for truncation */ 133 if (new_size != (size_t)new_size) { 134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 135 "exceeding address space"); 136 return -1; 137 } 138 139 if (new_size == migrate_xbzrle_cache_size()) { 140 /* nothing to do */ 141 return 0; 142 } 143 144 XBZRLE_cache_lock(); 145 146 if (XBZRLE.cache != NULL) { 147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 148 if (!new_cache) { 149 ret = -1; 150 goto out; 151 } 152 153 cache_fini(XBZRLE.cache); 154 XBZRLE.cache = new_cache; 155 } 156 out: 157 XBZRLE_cache_unlock(); 158 return ret; 159 } 160 161 bool ramblock_is_ignored(RAMBlock *block) 162 { 163 return !qemu_ram_is_migratable(block) || 164 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 165 } 166 167 #undef RAMBLOCK_FOREACH 168 169 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 170 { 171 RAMBlock *block; 172 int ret = 0; 173 174 RCU_READ_LOCK_GUARD(); 175 176 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 177 ret = func(block, opaque); 178 if (ret) { 179 break; 180 } 181 } 182 return ret; 183 } 184 185 static void ramblock_recv_map_init(void) 186 { 187 RAMBlock *rb; 188 189 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 190 assert(!rb->receivedmap); 191 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 192 } 193 } 194 195 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 196 { 197 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 198 rb->receivedmap); 199 } 200 201 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 202 { 203 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 204 } 205 206 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 207 { 208 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 209 } 210 211 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 212 size_t nr) 213 { 214 bitmap_set_atomic(rb->receivedmap, 215 ramblock_recv_bitmap_offset(host_addr, rb), 216 nr); 217 } 218 219 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 220 221 /* 222 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 223 * 224 * Returns >0 if success with sent bytes, or <0 if error. 225 */ 226 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 227 const char *block_name) 228 { 229 RAMBlock *block = qemu_ram_block_by_name(block_name); 230 unsigned long *le_bitmap, nbits; 231 uint64_t size; 232 233 if (!block) { 234 error_report("%s: invalid block name: %s", __func__, block_name); 235 return -1; 236 } 237 238 nbits = block->used_length >> TARGET_PAGE_BITS; 239 240 /* 241 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 242 * machines we may need 4 more bytes for padding (see below 243 * comment). So extend it a bit before hand. 244 */ 245 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 246 247 /* 248 * Always use little endian when sending the bitmap. This is 249 * required that when source and destination VMs are not using the 250 * same endianness. (Note: big endian won't work.) 251 */ 252 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 253 254 /* Size of the bitmap, in bytes */ 255 size = DIV_ROUND_UP(nbits, 8); 256 257 /* 258 * size is always aligned to 8 bytes for 64bit machines, but it 259 * may not be true for 32bit machines. We need this padding to 260 * make sure the migration can survive even between 32bit and 261 * 64bit machines. 262 */ 263 size = ROUND_UP(size, 8); 264 265 qemu_put_be64(file, size); 266 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 267 /* 268 * Mark as an end, in case the middle part is screwed up due to 269 * some "mysterious" reason. 270 */ 271 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 272 qemu_fflush(file); 273 274 g_free(le_bitmap); 275 276 if (qemu_file_get_error(file)) { 277 return qemu_file_get_error(file); 278 } 279 280 return size + sizeof(size); 281 } 282 283 /* 284 * An outstanding page request, on the source, having been received 285 * and queued 286 */ 287 struct RAMSrcPageRequest { 288 RAMBlock *rb; 289 hwaddr offset; 290 hwaddr len; 291 292 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 293 }; 294 295 /* State of RAM for migration */ 296 struct RAMState { 297 /* QEMUFile used for this migration */ 298 QEMUFile *f; 299 /* Last block that we have visited searching for dirty pages */ 300 RAMBlock *last_seen_block; 301 /* Last block from where we have sent data */ 302 RAMBlock *last_sent_block; 303 /* Last dirty target page we have sent */ 304 ram_addr_t last_page; 305 /* last ram version we have seen */ 306 uint32_t last_version; 307 /* We are in the first round */ 308 bool ram_bulk_stage; 309 /* The free page optimization is enabled */ 310 bool fpo_enabled; 311 /* How many times we have dirty too many pages */ 312 int dirty_rate_high_cnt; 313 /* these variables are used for bitmap sync */ 314 /* last time we did a full bitmap_sync */ 315 int64_t time_last_bitmap_sync; 316 /* bytes transferred at start_time */ 317 uint64_t bytes_xfer_prev; 318 /* number of dirty pages since start_time */ 319 uint64_t num_dirty_pages_period; 320 /* xbzrle misses since the beginning of the period */ 321 uint64_t xbzrle_cache_miss_prev; 322 /* Amount of xbzrle pages since the beginning of the period */ 323 uint64_t xbzrle_pages_prev; 324 /* Amount of xbzrle encoded bytes since the beginning of the period */ 325 uint64_t xbzrle_bytes_prev; 326 327 /* compression statistics since the beginning of the period */ 328 /* amount of count that no free thread to compress data */ 329 uint64_t compress_thread_busy_prev; 330 /* amount bytes after compression */ 331 uint64_t compressed_size_prev; 332 /* amount of compressed pages */ 333 uint64_t compress_pages_prev; 334 335 /* total handled target pages at the beginning of period */ 336 uint64_t target_page_count_prev; 337 /* total handled target pages since start */ 338 uint64_t target_page_count; 339 /* number of dirty bits in the bitmap */ 340 uint64_t migration_dirty_pages; 341 /* Protects modification of the bitmap and migration dirty pages */ 342 QemuMutex bitmap_mutex; 343 /* The RAMBlock used in the last src_page_requests */ 344 RAMBlock *last_req_rb; 345 /* Queue of outstanding page requests from the destination */ 346 QemuMutex src_page_req_mutex; 347 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 348 }; 349 typedef struct RAMState RAMState; 350 351 static RAMState *ram_state; 352 353 static NotifierWithReturnList precopy_notifier_list; 354 355 void precopy_infrastructure_init(void) 356 { 357 notifier_with_return_list_init(&precopy_notifier_list); 358 } 359 360 void precopy_add_notifier(NotifierWithReturn *n) 361 { 362 notifier_with_return_list_add(&precopy_notifier_list, n); 363 } 364 365 void precopy_remove_notifier(NotifierWithReturn *n) 366 { 367 notifier_with_return_remove(n); 368 } 369 370 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 371 { 372 PrecopyNotifyData pnd; 373 pnd.reason = reason; 374 pnd.errp = errp; 375 376 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 377 } 378 379 void precopy_enable_free_page_optimization(void) 380 { 381 if (!ram_state) { 382 return; 383 } 384 385 ram_state->fpo_enabled = true; 386 } 387 388 uint64_t ram_bytes_remaining(void) 389 { 390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 391 0; 392 } 393 394 MigrationStats ram_counters; 395 396 /* used by the search for pages to send */ 397 struct PageSearchStatus { 398 /* Current block being searched */ 399 RAMBlock *block; 400 /* Current page to search from */ 401 unsigned long page; 402 /* Set once we wrap around */ 403 bool complete_round; 404 }; 405 typedef struct PageSearchStatus PageSearchStatus; 406 407 CompressionStats compression_counters; 408 409 struct CompressParam { 410 bool done; 411 bool quit; 412 bool zero_page; 413 QEMUFile *file; 414 QemuMutex mutex; 415 QemuCond cond; 416 RAMBlock *block; 417 ram_addr_t offset; 418 419 /* internally used fields */ 420 z_stream stream; 421 uint8_t *originbuf; 422 }; 423 typedef struct CompressParam CompressParam; 424 425 struct DecompressParam { 426 bool done; 427 bool quit; 428 QemuMutex mutex; 429 QemuCond cond; 430 void *des; 431 uint8_t *compbuf; 432 int len; 433 z_stream stream; 434 }; 435 typedef struct DecompressParam DecompressParam; 436 437 static CompressParam *comp_param; 438 static QemuThread *compress_threads; 439 /* comp_done_cond is used to wake up the migration thread when 440 * one of the compression threads has finished the compression. 441 * comp_done_lock is used to co-work with comp_done_cond. 442 */ 443 static QemuMutex comp_done_lock; 444 static QemuCond comp_done_cond; 445 /* The empty QEMUFileOps will be used by file in CompressParam */ 446 static const QEMUFileOps empty_ops = { }; 447 448 static QEMUFile *decomp_file; 449 static DecompressParam *decomp_param; 450 static QemuThread *decompress_threads; 451 static QemuMutex decomp_done_lock; 452 static QemuCond decomp_done_cond; 453 454 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 455 ram_addr_t offset, uint8_t *source_buf); 456 457 static void *do_data_compress(void *opaque) 458 { 459 CompressParam *param = opaque; 460 RAMBlock *block; 461 ram_addr_t offset; 462 bool zero_page; 463 464 qemu_mutex_lock(¶m->mutex); 465 while (!param->quit) { 466 if (param->block) { 467 block = param->block; 468 offset = param->offset; 469 param->block = NULL; 470 qemu_mutex_unlock(¶m->mutex); 471 472 zero_page = do_compress_ram_page(param->file, ¶m->stream, 473 block, offset, param->originbuf); 474 475 qemu_mutex_lock(&comp_done_lock); 476 param->done = true; 477 param->zero_page = zero_page; 478 qemu_cond_signal(&comp_done_cond); 479 qemu_mutex_unlock(&comp_done_lock); 480 481 qemu_mutex_lock(¶m->mutex); 482 } else { 483 qemu_cond_wait(¶m->cond, ¶m->mutex); 484 } 485 } 486 qemu_mutex_unlock(¶m->mutex); 487 488 return NULL; 489 } 490 491 static void compress_threads_save_cleanup(void) 492 { 493 int i, thread_count; 494 495 if (!migrate_use_compression() || !comp_param) { 496 return; 497 } 498 499 thread_count = migrate_compress_threads(); 500 for (i = 0; i < thread_count; i++) { 501 /* 502 * we use it as a indicator which shows if the thread is 503 * properly init'd or not 504 */ 505 if (!comp_param[i].file) { 506 break; 507 } 508 509 qemu_mutex_lock(&comp_param[i].mutex); 510 comp_param[i].quit = true; 511 qemu_cond_signal(&comp_param[i].cond); 512 qemu_mutex_unlock(&comp_param[i].mutex); 513 514 qemu_thread_join(compress_threads + i); 515 qemu_mutex_destroy(&comp_param[i].mutex); 516 qemu_cond_destroy(&comp_param[i].cond); 517 deflateEnd(&comp_param[i].stream); 518 g_free(comp_param[i].originbuf); 519 qemu_fclose(comp_param[i].file); 520 comp_param[i].file = NULL; 521 } 522 qemu_mutex_destroy(&comp_done_lock); 523 qemu_cond_destroy(&comp_done_cond); 524 g_free(compress_threads); 525 g_free(comp_param); 526 compress_threads = NULL; 527 comp_param = NULL; 528 } 529 530 static int compress_threads_save_setup(void) 531 { 532 int i, thread_count; 533 534 if (!migrate_use_compression()) { 535 return 0; 536 } 537 thread_count = migrate_compress_threads(); 538 compress_threads = g_new0(QemuThread, thread_count); 539 comp_param = g_new0(CompressParam, thread_count); 540 qemu_cond_init(&comp_done_cond); 541 qemu_mutex_init(&comp_done_lock); 542 for (i = 0; i < thread_count; i++) { 543 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 544 if (!comp_param[i].originbuf) { 545 goto exit; 546 } 547 548 if (deflateInit(&comp_param[i].stream, 549 migrate_compress_level()) != Z_OK) { 550 g_free(comp_param[i].originbuf); 551 goto exit; 552 } 553 554 /* comp_param[i].file is just used as a dummy buffer to save data, 555 * set its ops to empty. 556 */ 557 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 558 comp_param[i].done = true; 559 comp_param[i].quit = false; 560 qemu_mutex_init(&comp_param[i].mutex); 561 qemu_cond_init(&comp_param[i].cond); 562 qemu_thread_create(compress_threads + i, "compress", 563 do_data_compress, comp_param + i, 564 QEMU_THREAD_JOINABLE); 565 } 566 return 0; 567 568 exit: 569 compress_threads_save_cleanup(); 570 return -1; 571 } 572 573 /** 574 * save_page_header: write page header to wire 575 * 576 * If this is the 1st block, it also writes the block identification 577 * 578 * Returns the number of bytes written 579 * 580 * @f: QEMUFile where to send the data 581 * @block: block that contains the page we want to send 582 * @offset: offset inside the block for the page 583 * in the lower bits, it contains flags 584 */ 585 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 586 ram_addr_t offset) 587 { 588 size_t size, len; 589 590 if (block == rs->last_sent_block) { 591 offset |= RAM_SAVE_FLAG_CONTINUE; 592 } 593 qemu_put_be64(f, offset); 594 size = 8; 595 596 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 597 len = strlen(block->idstr); 598 qemu_put_byte(f, len); 599 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 600 size += 1 + len; 601 rs->last_sent_block = block; 602 } 603 return size; 604 } 605 606 /** 607 * mig_throttle_guest_down: throotle down the guest 608 * 609 * Reduce amount of guest cpu execution to hopefully slow down memory 610 * writes. If guest dirty memory rate is reduced below the rate at 611 * which we can transfer pages to the destination then we should be 612 * able to complete migration. Some workloads dirty memory way too 613 * fast and will not effectively converge, even with auto-converge. 614 */ 615 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 616 uint64_t bytes_dirty_threshold) 617 { 618 MigrationState *s = migrate_get_current(); 619 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 620 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 621 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 622 int pct_max = s->parameters.max_cpu_throttle; 623 624 uint64_t throttle_now = cpu_throttle_get_percentage(); 625 uint64_t cpu_now, cpu_ideal, throttle_inc; 626 627 /* We have not started throttling yet. Let's start it. */ 628 if (!cpu_throttle_active()) { 629 cpu_throttle_set(pct_initial); 630 } else { 631 /* Throttling already on, just increase the rate */ 632 if (!pct_tailslow) { 633 throttle_inc = pct_increment; 634 } else { 635 /* Compute the ideal CPU percentage used by Guest, which may 636 * make the dirty rate match the dirty rate threshold. */ 637 cpu_now = 100 - throttle_now; 638 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 639 bytes_dirty_period); 640 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 641 } 642 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 643 } 644 } 645 646 /** 647 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 648 * 649 * @rs: current RAM state 650 * @current_addr: address for the zero page 651 * 652 * Update the xbzrle cache to reflect a page that's been sent as all 0. 653 * The important thing is that a stale (not-yet-0'd) page be replaced 654 * by the new data. 655 * As a bonus, if the page wasn't in the cache it gets added so that 656 * when a small write is made into the 0'd page it gets XBZRLE sent. 657 */ 658 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 659 { 660 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 661 return; 662 } 663 664 /* We don't care if this fails to allocate a new cache page 665 * as long as it updated an old one */ 666 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 667 ram_counters.dirty_sync_count); 668 } 669 670 #define ENCODING_FLAG_XBZRLE 0x1 671 672 /** 673 * save_xbzrle_page: compress and send current page 674 * 675 * Returns: 1 means that we wrote the page 676 * 0 means that page is identical to the one already sent 677 * -1 means that xbzrle would be longer than normal 678 * 679 * @rs: current RAM state 680 * @current_data: pointer to the address of the page contents 681 * @current_addr: addr of the page 682 * @block: block that contains the page we want to send 683 * @offset: offset inside the block for the page 684 * @last_stage: if we are at the completion stage 685 */ 686 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 687 ram_addr_t current_addr, RAMBlock *block, 688 ram_addr_t offset, bool last_stage) 689 { 690 int encoded_len = 0, bytes_xbzrle; 691 uint8_t *prev_cached_page; 692 693 if (!cache_is_cached(XBZRLE.cache, current_addr, 694 ram_counters.dirty_sync_count)) { 695 xbzrle_counters.cache_miss++; 696 if (!last_stage) { 697 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 698 ram_counters.dirty_sync_count) == -1) { 699 return -1; 700 } else { 701 /* update *current_data when the page has been 702 inserted into cache */ 703 *current_data = get_cached_data(XBZRLE.cache, current_addr); 704 } 705 } 706 return -1; 707 } 708 709 /* 710 * Reaching here means the page has hit the xbzrle cache, no matter what 711 * encoding result it is (normal encoding, overflow or skipping the page), 712 * count the page as encoded. This is used to calculate the encoding rate. 713 * 714 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 715 * 2nd page turns out to be skipped (i.e. no new bytes written to the 716 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 717 * skipped page included. In this way, the encoding rate can tell if the 718 * guest page is good for xbzrle encoding. 719 */ 720 xbzrle_counters.pages++; 721 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 722 723 /* save current buffer into memory */ 724 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 725 726 /* XBZRLE encoding (if there is no overflow) */ 727 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 728 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 729 TARGET_PAGE_SIZE); 730 731 /* 732 * Update the cache contents, so that it corresponds to the data 733 * sent, in all cases except where we skip the page. 734 */ 735 if (!last_stage && encoded_len != 0) { 736 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 737 /* 738 * In the case where we couldn't compress, ensure that the caller 739 * sends the data from the cache, since the guest might have 740 * changed the RAM since we copied it. 741 */ 742 *current_data = prev_cached_page; 743 } 744 745 if (encoded_len == 0) { 746 trace_save_xbzrle_page_skipping(); 747 return 0; 748 } else if (encoded_len == -1) { 749 trace_save_xbzrle_page_overflow(); 750 xbzrle_counters.overflow++; 751 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 752 return -1; 753 } 754 755 /* Send XBZRLE based compressed page */ 756 bytes_xbzrle = save_page_header(rs, rs->f, block, 757 offset | RAM_SAVE_FLAG_XBZRLE); 758 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 759 qemu_put_be16(rs->f, encoded_len); 760 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 761 bytes_xbzrle += encoded_len + 1 + 2; 762 /* 763 * Like compressed_size (please see update_compress_thread_counts), 764 * the xbzrle encoded bytes don't count the 8 byte header with 765 * RAM_SAVE_FLAG_CONTINUE. 766 */ 767 xbzrle_counters.bytes += bytes_xbzrle - 8; 768 ram_counters.transferred += bytes_xbzrle; 769 770 return 1; 771 } 772 773 /** 774 * migration_bitmap_find_dirty: find the next dirty page from start 775 * 776 * Returns the page offset within memory region of the start of a dirty page 777 * 778 * @rs: current RAM state 779 * @rb: RAMBlock where to search for dirty pages 780 * @start: page where we start the search 781 */ 782 static inline 783 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 784 unsigned long start) 785 { 786 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 787 unsigned long *bitmap = rb->bmap; 788 unsigned long next; 789 790 if (ramblock_is_ignored(rb)) { 791 return size; 792 } 793 794 /* 795 * When the free page optimization is enabled, we need to check the bitmap 796 * to send the non-free pages rather than all the pages in the bulk stage. 797 */ 798 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) { 799 next = start + 1; 800 } else { 801 next = find_next_bit(bitmap, size, start); 802 } 803 804 return next; 805 } 806 807 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 808 RAMBlock *rb, 809 unsigned long page) 810 { 811 bool ret; 812 813 qemu_mutex_lock(&rs->bitmap_mutex); 814 815 /* 816 * Clear dirty bitmap if needed. This _must_ be called before we 817 * send any of the page in the chunk because we need to make sure 818 * we can capture further page content changes when we sync dirty 819 * log the next time. So as long as we are going to send any of 820 * the page in the chunk we clear the remote dirty bitmap for all. 821 * Clearing it earlier won't be a problem, but too late will. 822 */ 823 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) { 824 uint8_t shift = rb->clear_bmap_shift; 825 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift); 826 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); 827 828 /* 829 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 830 * can make things easier sometimes since then start address 831 * of the small chunk will always be 64 pages aligned so the 832 * bitmap will always be aligned to unsigned long. We should 833 * even be able to remove this restriction but I'm simply 834 * keeping it. 835 */ 836 assert(shift >= 6); 837 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 838 memory_region_clear_dirty_bitmap(rb->mr, start, size); 839 } 840 841 ret = test_and_clear_bit(page, rb->bmap); 842 843 if (ret) { 844 rs->migration_dirty_pages--; 845 } 846 qemu_mutex_unlock(&rs->bitmap_mutex); 847 848 return ret; 849 } 850 851 /* Called with RCU critical section */ 852 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 853 { 854 uint64_t new_dirty_pages = 855 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 856 857 rs->migration_dirty_pages += new_dirty_pages; 858 rs->num_dirty_pages_period += new_dirty_pages; 859 } 860 861 /** 862 * ram_pagesize_summary: calculate all the pagesizes of a VM 863 * 864 * Returns a summary bitmap of the page sizes of all RAMBlocks 865 * 866 * For VMs with just normal pages this is equivalent to the host page 867 * size. If it's got some huge pages then it's the OR of all the 868 * different page sizes. 869 */ 870 uint64_t ram_pagesize_summary(void) 871 { 872 RAMBlock *block; 873 uint64_t summary = 0; 874 875 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 876 summary |= block->page_size; 877 } 878 879 return summary; 880 } 881 882 uint64_t ram_get_total_transferred_pages(void) 883 { 884 return ram_counters.normal + ram_counters.duplicate + 885 compression_counters.pages + xbzrle_counters.pages; 886 } 887 888 static void migration_update_rates(RAMState *rs, int64_t end_time) 889 { 890 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 891 double compressed_size; 892 893 /* calculate period counters */ 894 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 895 / (end_time - rs->time_last_bitmap_sync); 896 897 if (!page_count) { 898 return; 899 } 900 901 if (migrate_use_xbzrle()) { 902 double encoded_size, unencoded_size; 903 904 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 905 rs->xbzrle_cache_miss_prev) / page_count; 906 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 907 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 908 TARGET_PAGE_SIZE; 909 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 910 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 911 xbzrle_counters.encoding_rate = 0; 912 } else { 913 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 914 } 915 rs->xbzrle_pages_prev = xbzrle_counters.pages; 916 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 917 } 918 919 if (migrate_use_compression()) { 920 compression_counters.busy_rate = (double)(compression_counters.busy - 921 rs->compress_thread_busy_prev) / page_count; 922 rs->compress_thread_busy_prev = compression_counters.busy; 923 924 compressed_size = compression_counters.compressed_size - 925 rs->compressed_size_prev; 926 if (compressed_size) { 927 double uncompressed_size = (compression_counters.pages - 928 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 929 930 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 931 compression_counters.compression_rate = 932 uncompressed_size / compressed_size; 933 934 rs->compress_pages_prev = compression_counters.pages; 935 rs->compressed_size_prev = compression_counters.compressed_size; 936 } 937 } 938 } 939 940 static void migration_trigger_throttle(RAMState *rs) 941 { 942 MigrationState *s = migrate_get_current(); 943 uint64_t threshold = s->parameters.throttle_trigger_threshold; 944 945 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 946 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 947 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 948 949 /* During block migration the auto-converge logic incorrectly detects 950 * that ram migration makes no progress. Avoid this by disabling the 951 * throttling logic during the bulk phase of block migration. */ 952 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 953 /* The following detection logic can be refined later. For now: 954 Check to see if the ratio between dirtied bytes and the approx. 955 amount of bytes that just got transferred since the last time 956 we were in this routine reaches the threshold. If that happens 957 twice, start or increase throttling. */ 958 959 if ((bytes_dirty_period > bytes_dirty_threshold) && 960 (++rs->dirty_rate_high_cnt >= 2)) { 961 trace_migration_throttle(); 962 rs->dirty_rate_high_cnt = 0; 963 mig_throttle_guest_down(bytes_dirty_period, 964 bytes_dirty_threshold); 965 } 966 } 967 } 968 969 static void migration_bitmap_sync(RAMState *rs) 970 { 971 RAMBlock *block; 972 int64_t end_time; 973 974 ram_counters.dirty_sync_count++; 975 976 if (!rs->time_last_bitmap_sync) { 977 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 978 } 979 980 trace_migration_bitmap_sync_start(); 981 memory_global_dirty_log_sync(); 982 983 qemu_mutex_lock(&rs->bitmap_mutex); 984 WITH_RCU_READ_LOCK_GUARD() { 985 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 986 ramblock_sync_dirty_bitmap(rs, block); 987 } 988 ram_counters.remaining = ram_bytes_remaining(); 989 } 990 qemu_mutex_unlock(&rs->bitmap_mutex); 991 992 memory_global_after_dirty_log_sync(); 993 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 994 995 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 996 997 /* more than 1 second = 1000 millisecons */ 998 if (end_time > rs->time_last_bitmap_sync + 1000) { 999 migration_trigger_throttle(rs); 1000 1001 migration_update_rates(rs, end_time); 1002 1003 rs->target_page_count_prev = rs->target_page_count; 1004 1005 /* reset period counters */ 1006 rs->time_last_bitmap_sync = end_time; 1007 rs->num_dirty_pages_period = 0; 1008 rs->bytes_xfer_prev = ram_counters.transferred; 1009 } 1010 if (migrate_use_events()) { 1011 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1012 } 1013 } 1014 1015 static void migration_bitmap_sync_precopy(RAMState *rs) 1016 { 1017 Error *local_err = NULL; 1018 1019 /* 1020 * The current notifier usage is just an optimization to migration, so we 1021 * don't stop the normal migration process in the error case. 1022 */ 1023 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1024 error_report_err(local_err); 1025 local_err = NULL; 1026 } 1027 1028 migration_bitmap_sync(rs); 1029 1030 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1031 error_report_err(local_err); 1032 } 1033 } 1034 1035 /** 1036 * save_zero_page_to_file: send the zero page to the file 1037 * 1038 * Returns the size of data written to the file, 0 means the page is not 1039 * a zero page 1040 * 1041 * @rs: current RAM state 1042 * @file: the file where the data is saved 1043 * @block: block that contains the page we want to send 1044 * @offset: offset inside the block for the page 1045 */ 1046 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1047 RAMBlock *block, ram_addr_t offset) 1048 { 1049 uint8_t *p = block->host + offset; 1050 int len = 0; 1051 1052 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1053 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1054 qemu_put_byte(file, 0); 1055 len += 1; 1056 } 1057 return len; 1058 } 1059 1060 /** 1061 * save_zero_page: send the zero page to the stream 1062 * 1063 * Returns the number of pages written. 1064 * 1065 * @rs: current RAM state 1066 * @block: block that contains the page we want to send 1067 * @offset: offset inside the block for the page 1068 */ 1069 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1070 { 1071 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1072 1073 if (len) { 1074 ram_counters.duplicate++; 1075 ram_counters.transferred += len; 1076 return 1; 1077 } 1078 return -1; 1079 } 1080 1081 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1082 { 1083 if (!migrate_release_ram() || !migration_in_postcopy()) { 1084 return; 1085 } 1086 1087 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1088 } 1089 1090 /* 1091 * @pages: the number of pages written by the control path, 1092 * < 0 - error 1093 * > 0 - number of pages written 1094 * 1095 * Return true if the pages has been saved, otherwise false is returned. 1096 */ 1097 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1098 int *pages) 1099 { 1100 uint64_t bytes_xmit = 0; 1101 int ret; 1102 1103 *pages = -1; 1104 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1105 &bytes_xmit); 1106 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1107 return false; 1108 } 1109 1110 if (bytes_xmit) { 1111 ram_counters.transferred += bytes_xmit; 1112 *pages = 1; 1113 } 1114 1115 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1116 return true; 1117 } 1118 1119 if (bytes_xmit > 0) { 1120 ram_counters.normal++; 1121 } else if (bytes_xmit == 0) { 1122 ram_counters.duplicate++; 1123 } 1124 1125 return true; 1126 } 1127 1128 /* 1129 * directly send the page to the stream 1130 * 1131 * Returns the number of pages written. 1132 * 1133 * @rs: current RAM state 1134 * @block: block that contains the page we want to send 1135 * @offset: offset inside the block for the page 1136 * @buf: the page to be sent 1137 * @async: send to page asyncly 1138 */ 1139 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1140 uint8_t *buf, bool async) 1141 { 1142 ram_counters.transferred += save_page_header(rs, rs->f, block, 1143 offset | RAM_SAVE_FLAG_PAGE); 1144 if (async) { 1145 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1146 migrate_release_ram() & 1147 migration_in_postcopy()); 1148 } else { 1149 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1150 } 1151 ram_counters.transferred += TARGET_PAGE_SIZE; 1152 ram_counters.normal++; 1153 return 1; 1154 } 1155 1156 /** 1157 * ram_save_page: send the given page to the stream 1158 * 1159 * Returns the number of pages written. 1160 * < 0 - error 1161 * >=0 - Number of pages written - this might legally be 0 1162 * if xbzrle noticed the page was the same. 1163 * 1164 * @rs: current RAM state 1165 * @block: block that contains the page we want to send 1166 * @offset: offset inside the block for the page 1167 * @last_stage: if we are at the completion stage 1168 */ 1169 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1170 { 1171 int pages = -1; 1172 uint8_t *p; 1173 bool send_async = true; 1174 RAMBlock *block = pss->block; 1175 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1176 ram_addr_t current_addr = block->offset + offset; 1177 1178 p = block->host + offset; 1179 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1180 1181 XBZRLE_cache_lock(); 1182 if (!rs->ram_bulk_stage && !migration_in_postcopy() && 1183 migrate_use_xbzrle()) { 1184 pages = save_xbzrle_page(rs, &p, current_addr, block, 1185 offset, last_stage); 1186 if (!last_stage) { 1187 /* Can't send this cached data async, since the cache page 1188 * might get updated before it gets to the wire 1189 */ 1190 send_async = false; 1191 } 1192 } 1193 1194 /* XBZRLE overflow or normal page */ 1195 if (pages == -1) { 1196 pages = save_normal_page(rs, block, offset, p, send_async); 1197 } 1198 1199 XBZRLE_cache_unlock(); 1200 1201 return pages; 1202 } 1203 1204 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1205 ram_addr_t offset) 1206 { 1207 if (multifd_queue_page(rs->f, block, offset) < 0) { 1208 return -1; 1209 } 1210 ram_counters.normal++; 1211 1212 return 1; 1213 } 1214 1215 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1216 ram_addr_t offset, uint8_t *source_buf) 1217 { 1218 RAMState *rs = ram_state; 1219 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1220 bool zero_page = false; 1221 int ret; 1222 1223 if (save_zero_page_to_file(rs, f, block, offset)) { 1224 zero_page = true; 1225 goto exit; 1226 } 1227 1228 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1229 1230 /* 1231 * copy it to a internal buffer to avoid it being modified by VM 1232 * so that we can catch up the error during compression and 1233 * decompression 1234 */ 1235 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1236 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1237 if (ret < 0) { 1238 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1239 error_report("compressed data failed!"); 1240 return false; 1241 } 1242 1243 exit: 1244 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1245 return zero_page; 1246 } 1247 1248 static void 1249 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1250 { 1251 ram_counters.transferred += bytes_xmit; 1252 1253 if (param->zero_page) { 1254 ram_counters.duplicate++; 1255 return; 1256 } 1257 1258 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1259 compression_counters.compressed_size += bytes_xmit - 8; 1260 compression_counters.pages++; 1261 } 1262 1263 static bool save_page_use_compression(RAMState *rs); 1264 1265 static void flush_compressed_data(RAMState *rs) 1266 { 1267 int idx, len, thread_count; 1268 1269 if (!save_page_use_compression(rs)) { 1270 return; 1271 } 1272 thread_count = migrate_compress_threads(); 1273 1274 qemu_mutex_lock(&comp_done_lock); 1275 for (idx = 0; idx < thread_count; idx++) { 1276 while (!comp_param[idx].done) { 1277 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1278 } 1279 } 1280 qemu_mutex_unlock(&comp_done_lock); 1281 1282 for (idx = 0; idx < thread_count; idx++) { 1283 qemu_mutex_lock(&comp_param[idx].mutex); 1284 if (!comp_param[idx].quit) { 1285 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1286 /* 1287 * it's safe to fetch zero_page without holding comp_done_lock 1288 * as there is no further request submitted to the thread, 1289 * i.e, the thread should be waiting for a request at this point. 1290 */ 1291 update_compress_thread_counts(&comp_param[idx], len); 1292 } 1293 qemu_mutex_unlock(&comp_param[idx].mutex); 1294 } 1295 } 1296 1297 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1298 ram_addr_t offset) 1299 { 1300 param->block = block; 1301 param->offset = offset; 1302 } 1303 1304 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1305 ram_addr_t offset) 1306 { 1307 int idx, thread_count, bytes_xmit = -1, pages = -1; 1308 bool wait = migrate_compress_wait_thread(); 1309 1310 thread_count = migrate_compress_threads(); 1311 qemu_mutex_lock(&comp_done_lock); 1312 retry: 1313 for (idx = 0; idx < thread_count; idx++) { 1314 if (comp_param[idx].done) { 1315 comp_param[idx].done = false; 1316 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1317 qemu_mutex_lock(&comp_param[idx].mutex); 1318 set_compress_params(&comp_param[idx], block, offset); 1319 qemu_cond_signal(&comp_param[idx].cond); 1320 qemu_mutex_unlock(&comp_param[idx].mutex); 1321 pages = 1; 1322 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1323 break; 1324 } 1325 } 1326 1327 /* 1328 * wait for the free thread if the user specifies 'compress-wait-thread', 1329 * otherwise we will post the page out in the main thread as normal page. 1330 */ 1331 if (pages < 0 && wait) { 1332 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1333 goto retry; 1334 } 1335 qemu_mutex_unlock(&comp_done_lock); 1336 1337 return pages; 1338 } 1339 1340 /** 1341 * find_dirty_block: find the next dirty page and update any state 1342 * associated with the search process. 1343 * 1344 * Returns true if a page is found 1345 * 1346 * @rs: current RAM state 1347 * @pss: data about the state of the current dirty page scan 1348 * @again: set to false if the search has scanned the whole of RAM 1349 */ 1350 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1351 { 1352 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1353 if (pss->complete_round && pss->block == rs->last_seen_block && 1354 pss->page >= rs->last_page) { 1355 /* 1356 * We've been once around the RAM and haven't found anything. 1357 * Give up. 1358 */ 1359 *again = false; 1360 return false; 1361 } 1362 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS) 1363 >= pss->block->used_length) { 1364 /* Didn't find anything in this RAM Block */ 1365 pss->page = 0; 1366 pss->block = QLIST_NEXT_RCU(pss->block, next); 1367 if (!pss->block) { 1368 /* 1369 * If memory migration starts over, we will meet a dirtied page 1370 * which may still exists in compression threads's ring, so we 1371 * should flush the compressed data to make sure the new page 1372 * is not overwritten by the old one in the destination. 1373 * 1374 * Also If xbzrle is on, stop using the data compression at this 1375 * point. In theory, xbzrle can do better than compression. 1376 */ 1377 flush_compressed_data(rs); 1378 1379 /* Hit the end of the list */ 1380 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1381 /* Flag that we've looped */ 1382 pss->complete_round = true; 1383 rs->ram_bulk_stage = false; 1384 } 1385 /* Didn't find anything this time, but try again on the new block */ 1386 *again = true; 1387 return false; 1388 } else { 1389 /* Can go around again, but... */ 1390 *again = true; 1391 /* We've found something so probably don't need to */ 1392 return true; 1393 } 1394 } 1395 1396 /** 1397 * unqueue_page: gets a page of the queue 1398 * 1399 * Helper for 'get_queued_page' - gets a page off the queue 1400 * 1401 * Returns the block of the page (or NULL if none available) 1402 * 1403 * @rs: current RAM state 1404 * @offset: used to return the offset within the RAMBlock 1405 */ 1406 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1407 { 1408 RAMBlock *block = NULL; 1409 1410 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1411 return NULL; 1412 } 1413 1414 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1415 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1416 struct RAMSrcPageRequest *entry = 1417 QSIMPLEQ_FIRST(&rs->src_page_requests); 1418 block = entry->rb; 1419 *offset = entry->offset; 1420 1421 if (entry->len > TARGET_PAGE_SIZE) { 1422 entry->len -= TARGET_PAGE_SIZE; 1423 entry->offset += TARGET_PAGE_SIZE; 1424 } else { 1425 memory_region_unref(block->mr); 1426 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1427 g_free(entry); 1428 migration_consume_urgent_request(); 1429 } 1430 } 1431 1432 return block; 1433 } 1434 1435 /** 1436 * get_queued_page: unqueue a page from the postcopy requests 1437 * 1438 * Skips pages that are already sent (!dirty) 1439 * 1440 * Returns true if a queued page is found 1441 * 1442 * @rs: current RAM state 1443 * @pss: data about the state of the current dirty page scan 1444 */ 1445 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1446 { 1447 RAMBlock *block; 1448 ram_addr_t offset; 1449 bool dirty; 1450 1451 do { 1452 block = unqueue_page(rs, &offset); 1453 /* 1454 * We're sending this page, and since it's postcopy nothing else 1455 * will dirty it, and we must make sure it doesn't get sent again 1456 * even if this queue request was received after the background 1457 * search already sent it. 1458 */ 1459 if (block) { 1460 unsigned long page; 1461 1462 page = offset >> TARGET_PAGE_BITS; 1463 dirty = test_bit(page, block->bmap); 1464 if (!dirty) { 1465 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1466 page); 1467 } else { 1468 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1469 } 1470 } 1471 1472 } while (block && !dirty); 1473 1474 if (block) { 1475 /* 1476 * As soon as we start servicing pages out of order, then we have 1477 * to kill the bulk stage, since the bulk stage assumes 1478 * in (migration_bitmap_find_and_reset_dirty) that every page is 1479 * dirty, that's no longer true. 1480 */ 1481 rs->ram_bulk_stage = false; 1482 1483 /* 1484 * We want the background search to continue from the queued page 1485 * since the guest is likely to want other pages near to the page 1486 * it just requested. 1487 */ 1488 pss->block = block; 1489 pss->page = offset >> TARGET_PAGE_BITS; 1490 1491 /* 1492 * This unqueued page would break the "one round" check, even is 1493 * really rare. 1494 */ 1495 pss->complete_round = false; 1496 } 1497 1498 return !!block; 1499 } 1500 1501 /** 1502 * migration_page_queue_free: drop any remaining pages in the ram 1503 * request queue 1504 * 1505 * It should be empty at the end anyway, but in error cases there may 1506 * be some left. in case that there is any page left, we drop it. 1507 * 1508 */ 1509 static void migration_page_queue_free(RAMState *rs) 1510 { 1511 struct RAMSrcPageRequest *mspr, *next_mspr; 1512 /* This queue generally should be empty - but in the case of a failed 1513 * migration might have some droppings in. 1514 */ 1515 RCU_READ_LOCK_GUARD(); 1516 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1517 memory_region_unref(mspr->rb->mr); 1518 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1519 g_free(mspr); 1520 } 1521 } 1522 1523 /** 1524 * ram_save_queue_pages: queue the page for transmission 1525 * 1526 * A request from postcopy destination for example. 1527 * 1528 * Returns zero on success or negative on error 1529 * 1530 * @rbname: Name of the RAMBLock of the request. NULL means the 1531 * same that last one. 1532 * @start: starting address from the start of the RAMBlock 1533 * @len: length (in bytes) to send 1534 */ 1535 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1536 { 1537 RAMBlock *ramblock; 1538 RAMState *rs = ram_state; 1539 1540 ram_counters.postcopy_requests++; 1541 RCU_READ_LOCK_GUARD(); 1542 1543 if (!rbname) { 1544 /* Reuse last RAMBlock */ 1545 ramblock = rs->last_req_rb; 1546 1547 if (!ramblock) { 1548 /* 1549 * Shouldn't happen, we can't reuse the last RAMBlock if 1550 * it's the 1st request. 1551 */ 1552 error_report("ram_save_queue_pages no previous block"); 1553 return -1; 1554 } 1555 } else { 1556 ramblock = qemu_ram_block_by_name(rbname); 1557 1558 if (!ramblock) { 1559 /* We shouldn't be asked for a non-existent RAMBlock */ 1560 error_report("ram_save_queue_pages no block '%s'", rbname); 1561 return -1; 1562 } 1563 rs->last_req_rb = ramblock; 1564 } 1565 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1566 if (start+len > ramblock->used_length) { 1567 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1568 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1569 __func__, start, len, ramblock->used_length); 1570 return -1; 1571 } 1572 1573 struct RAMSrcPageRequest *new_entry = 1574 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1575 new_entry->rb = ramblock; 1576 new_entry->offset = start; 1577 new_entry->len = len; 1578 1579 memory_region_ref(ramblock->mr); 1580 qemu_mutex_lock(&rs->src_page_req_mutex); 1581 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1582 migration_make_urgent_request(); 1583 qemu_mutex_unlock(&rs->src_page_req_mutex); 1584 1585 return 0; 1586 } 1587 1588 static bool save_page_use_compression(RAMState *rs) 1589 { 1590 if (!migrate_use_compression()) { 1591 return false; 1592 } 1593 1594 /* 1595 * If xbzrle is on, stop using the data compression after first 1596 * round of migration even if compression is enabled. In theory, 1597 * xbzrle can do better than compression. 1598 */ 1599 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 1600 return true; 1601 } 1602 1603 return false; 1604 } 1605 1606 /* 1607 * try to compress the page before posting it out, return true if the page 1608 * has been properly handled by compression, otherwise needs other 1609 * paths to handle it 1610 */ 1611 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1612 { 1613 if (!save_page_use_compression(rs)) { 1614 return false; 1615 } 1616 1617 /* 1618 * When starting the process of a new block, the first page of 1619 * the block should be sent out before other pages in the same 1620 * block, and all the pages in last block should have been sent 1621 * out, keeping this order is important, because the 'cont' flag 1622 * is used to avoid resending the block name. 1623 * 1624 * We post the fist page as normal page as compression will take 1625 * much CPU resource. 1626 */ 1627 if (block != rs->last_sent_block) { 1628 flush_compressed_data(rs); 1629 return false; 1630 } 1631 1632 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1633 return true; 1634 } 1635 1636 compression_counters.busy++; 1637 return false; 1638 } 1639 1640 /** 1641 * ram_save_target_page: save one target page 1642 * 1643 * Returns the number of pages written 1644 * 1645 * @rs: current RAM state 1646 * @pss: data about the page we want to send 1647 * @last_stage: if we are at the completion stage 1648 */ 1649 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1650 bool last_stage) 1651 { 1652 RAMBlock *block = pss->block; 1653 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1654 int res; 1655 1656 if (control_save_page(rs, block, offset, &res)) { 1657 return res; 1658 } 1659 1660 if (save_compress_page(rs, block, offset)) { 1661 return 1; 1662 } 1663 1664 res = save_zero_page(rs, block, offset); 1665 if (res > 0) { 1666 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1667 * page would be stale 1668 */ 1669 if (!save_page_use_compression(rs)) { 1670 XBZRLE_cache_lock(); 1671 xbzrle_cache_zero_page(rs, block->offset + offset); 1672 XBZRLE_cache_unlock(); 1673 } 1674 ram_release_pages(block->idstr, offset, res); 1675 return res; 1676 } 1677 1678 /* 1679 * Do not use multifd for: 1680 * 1. Compression as the first page in the new block should be posted out 1681 * before sending the compressed page 1682 * 2. In postcopy as one whole host page should be placed 1683 */ 1684 if (!save_page_use_compression(rs) && migrate_use_multifd() 1685 && !migration_in_postcopy()) { 1686 return ram_save_multifd_page(rs, block, offset); 1687 } 1688 1689 return ram_save_page(rs, pss, last_stage); 1690 } 1691 1692 /** 1693 * ram_save_host_page: save a whole host page 1694 * 1695 * Starting at *offset send pages up to the end of the current host 1696 * page. It's valid for the initial offset to point into the middle of 1697 * a host page in which case the remainder of the hostpage is sent. 1698 * Only dirty target pages are sent. Note that the host page size may 1699 * be a huge page for this block. 1700 * The saving stops at the boundary of the used_length of the block 1701 * if the RAMBlock isn't a multiple of the host page size. 1702 * 1703 * Returns the number of pages written or negative on error 1704 * 1705 * @rs: current RAM state 1706 * @ms: current migration state 1707 * @pss: data about the page we want to send 1708 * @last_stage: if we are at the completion stage 1709 */ 1710 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 1711 bool last_stage) 1712 { 1713 int tmppages, pages = 0; 1714 size_t pagesize_bits = 1715 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 1716 1717 if (ramblock_is_ignored(pss->block)) { 1718 error_report("block %s should not be migrated !", pss->block->idstr); 1719 return 0; 1720 } 1721 1722 do { 1723 /* Check the pages is dirty and if it is send it */ 1724 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 1725 pss->page++; 1726 continue; 1727 } 1728 1729 tmppages = ram_save_target_page(rs, pss, last_stage); 1730 if (tmppages < 0) { 1731 return tmppages; 1732 } 1733 1734 pages += tmppages; 1735 pss->page++; 1736 /* Allow rate limiting to happen in the middle of huge pages */ 1737 migration_rate_limit(); 1738 } while ((pss->page & (pagesize_bits - 1)) && 1739 offset_in_ramblock(pss->block, 1740 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 1741 1742 /* The offset we leave with is the last one we looked at */ 1743 pss->page--; 1744 return pages; 1745 } 1746 1747 /** 1748 * ram_find_and_save_block: finds a dirty page and sends it to f 1749 * 1750 * Called within an RCU critical section. 1751 * 1752 * Returns the number of pages written where zero means no dirty pages, 1753 * or negative on error 1754 * 1755 * @rs: current RAM state 1756 * @last_stage: if we are at the completion stage 1757 * 1758 * On systems where host-page-size > target-page-size it will send all the 1759 * pages in a host page that are dirty. 1760 */ 1761 1762 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1763 { 1764 PageSearchStatus pss; 1765 int pages = 0; 1766 bool again, found; 1767 1768 /* No dirty page as there is zero RAM */ 1769 if (!ram_bytes_total()) { 1770 return pages; 1771 } 1772 1773 pss.block = rs->last_seen_block; 1774 pss.page = rs->last_page; 1775 pss.complete_round = false; 1776 1777 if (!pss.block) { 1778 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1779 } 1780 1781 do { 1782 again = true; 1783 found = get_queued_page(rs, &pss); 1784 1785 if (!found) { 1786 /* priority queue empty, so just search for something dirty */ 1787 found = find_dirty_block(rs, &pss, &again); 1788 } 1789 1790 if (found) { 1791 pages = ram_save_host_page(rs, &pss, last_stage); 1792 } 1793 } while (!pages && again); 1794 1795 rs->last_seen_block = pss.block; 1796 rs->last_page = pss.page; 1797 1798 return pages; 1799 } 1800 1801 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1802 { 1803 uint64_t pages = size / TARGET_PAGE_SIZE; 1804 1805 if (zero) { 1806 ram_counters.duplicate += pages; 1807 } else { 1808 ram_counters.normal += pages; 1809 ram_counters.transferred += size; 1810 qemu_update_position(f, size); 1811 } 1812 } 1813 1814 static uint64_t ram_bytes_total_common(bool count_ignored) 1815 { 1816 RAMBlock *block; 1817 uint64_t total = 0; 1818 1819 RCU_READ_LOCK_GUARD(); 1820 1821 if (count_ignored) { 1822 RAMBLOCK_FOREACH_MIGRATABLE(block) { 1823 total += block->used_length; 1824 } 1825 } else { 1826 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1827 total += block->used_length; 1828 } 1829 } 1830 return total; 1831 } 1832 1833 uint64_t ram_bytes_total(void) 1834 { 1835 return ram_bytes_total_common(false); 1836 } 1837 1838 static void xbzrle_load_setup(void) 1839 { 1840 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 1841 } 1842 1843 static void xbzrle_load_cleanup(void) 1844 { 1845 g_free(XBZRLE.decoded_buf); 1846 XBZRLE.decoded_buf = NULL; 1847 } 1848 1849 static void ram_state_cleanup(RAMState **rsp) 1850 { 1851 if (*rsp) { 1852 migration_page_queue_free(*rsp); 1853 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 1854 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 1855 g_free(*rsp); 1856 *rsp = NULL; 1857 } 1858 } 1859 1860 static void xbzrle_cleanup(void) 1861 { 1862 XBZRLE_cache_lock(); 1863 if (XBZRLE.cache) { 1864 cache_fini(XBZRLE.cache); 1865 g_free(XBZRLE.encoded_buf); 1866 g_free(XBZRLE.current_buf); 1867 g_free(XBZRLE.zero_target_page); 1868 XBZRLE.cache = NULL; 1869 XBZRLE.encoded_buf = NULL; 1870 XBZRLE.current_buf = NULL; 1871 XBZRLE.zero_target_page = NULL; 1872 } 1873 XBZRLE_cache_unlock(); 1874 } 1875 1876 static void ram_save_cleanup(void *opaque) 1877 { 1878 RAMState **rsp = opaque; 1879 RAMBlock *block; 1880 1881 /* caller have hold iothread lock or is in a bh, so there is 1882 * no writing race against the migration bitmap 1883 */ 1884 memory_global_dirty_log_stop(); 1885 1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1887 g_free(block->clear_bmap); 1888 block->clear_bmap = NULL; 1889 g_free(block->bmap); 1890 block->bmap = NULL; 1891 } 1892 1893 xbzrle_cleanup(); 1894 compress_threads_save_cleanup(); 1895 ram_state_cleanup(rsp); 1896 } 1897 1898 static void ram_state_reset(RAMState *rs) 1899 { 1900 rs->last_seen_block = NULL; 1901 rs->last_sent_block = NULL; 1902 rs->last_page = 0; 1903 rs->last_version = ram_list.version; 1904 rs->ram_bulk_stage = true; 1905 rs->fpo_enabled = false; 1906 } 1907 1908 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1909 1910 /* 1911 * 'expected' is the value you expect the bitmap mostly to be full 1912 * of; it won't bother printing lines that are all this value. 1913 * If 'todump' is null the migration bitmap is dumped. 1914 */ 1915 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 1916 unsigned long pages) 1917 { 1918 int64_t cur; 1919 int64_t linelen = 128; 1920 char linebuf[129]; 1921 1922 for (cur = 0; cur < pages; cur += linelen) { 1923 int64_t curb; 1924 bool found = false; 1925 /* 1926 * Last line; catch the case where the line length 1927 * is longer than remaining ram 1928 */ 1929 if (cur + linelen > pages) { 1930 linelen = pages - cur; 1931 } 1932 for (curb = 0; curb < linelen; curb++) { 1933 bool thisbit = test_bit(cur + curb, todump); 1934 linebuf[curb] = thisbit ? '1' : '.'; 1935 found = found || (thisbit != expected); 1936 } 1937 if (found) { 1938 linebuf[curb] = '\0'; 1939 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1940 } 1941 } 1942 } 1943 1944 /* **** functions for postcopy ***** */ 1945 1946 void ram_postcopy_migrated_memory_release(MigrationState *ms) 1947 { 1948 struct RAMBlock *block; 1949 1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1951 unsigned long *bitmap = block->bmap; 1952 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 1953 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 1954 1955 while (run_start < range) { 1956 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 1957 ram_discard_range(block->idstr, 1958 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 1959 ((ram_addr_t)(run_end - run_start)) 1960 << TARGET_PAGE_BITS); 1961 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 1962 } 1963 } 1964 } 1965 1966 /** 1967 * postcopy_send_discard_bm_ram: discard a RAMBlock 1968 * 1969 * Returns zero on success 1970 * 1971 * Callback from postcopy_each_ram_send_discard for each RAMBlock 1972 * 1973 * @ms: current migration state 1974 * @block: RAMBlock to discard 1975 */ 1976 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 1977 { 1978 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 1979 unsigned long current; 1980 unsigned long *bitmap = block->bmap; 1981 1982 for (current = 0; current < end; ) { 1983 unsigned long one = find_next_bit(bitmap, end, current); 1984 unsigned long zero, discard_length; 1985 1986 if (one >= end) { 1987 break; 1988 } 1989 1990 zero = find_next_zero_bit(bitmap, end, one + 1); 1991 1992 if (zero >= end) { 1993 discard_length = end - one; 1994 } else { 1995 discard_length = zero - one; 1996 } 1997 postcopy_discard_send_range(ms, one, discard_length); 1998 current = one + discard_length; 1999 } 2000 2001 return 0; 2002 } 2003 2004 /** 2005 * postcopy_each_ram_send_discard: discard all RAMBlocks 2006 * 2007 * Returns 0 for success or negative for error 2008 * 2009 * Utility for the outgoing postcopy code. 2010 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2011 * passing it bitmap indexes and name. 2012 * (qemu_ram_foreach_block ends up passing unscaled lengths 2013 * which would mean postcopy code would have to deal with target page) 2014 * 2015 * @ms: current migration state 2016 */ 2017 static int postcopy_each_ram_send_discard(MigrationState *ms) 2018 { 2019 struct RAMBlock *block; 2020 int ret; 2021 2022 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2023 postcopy_discard_send_init(ms, block->idstr); 2024 2025 /* 2026 * Postcopy sends chunks of bitmap over the wire, but it 2027 * just needs indexes at this point, avoids it having 2028 * target page specific code. 2029 */ 2030 ret = postcopy_send_discard_bm_ram(ms, block); 2031 postcopy_discard_send_finish(ms); 2032 if (ret) { 2033 return ret; 2034 } 2035 } 2036 2037 return 0; 2038 } 2039 2040 /** 2041 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2042 * 2043 * Helper for postcopy_chunk_hostpages; it's called twice to 2044 * canonicalize the two bitmaps, that are similar, but one is 2045 * inverted. 2046 * 2047 * Postcopy requires that all target pages in a hostpage are dirty or 2048 * clean, not a mix. This function canonicalizes the bitmaps. 2049 * 2050 * @ms: current migration state 2051 * @block: block that contains the page we want to canonicalize 2052 */ 2053 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2054 { 2055 RAMState *rs = ram_state; 2056 unsigned long *bitmap = block->bmap; 2057 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2058 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2059 unsigned long run_start; 2060 2061 if (block->page_size == TARGET_PAGE_SIZE) { 2062 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2063 return; 2064 } 2065 2066 /* Find a dirty page */ 2067 run_start = find_next_bit(bitmap, pages, 0); 2068 2069 while (run_start < pages) { 2070 2071 /* 2072 * If the start of this run of pages is in the middle of a host 2073 * page, then we need to fixup this host page. 2074 */ 2075 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2076 /* Find the end of this run */ 2077 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2078 /* 2079 * If the end isn't at the start of a host page, then the 2080 * run doesn't finish at the end of a host page 2081 * and we need to discard. 2082 */ 2083 } 2084 2085 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2086 unsigned long page; 2087 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2088 host_ratio); 2089 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2090 2091 /* Clean up the bitmap */ 2092 for (page = fixup_start_addr; 2093 page < fixup_start_addr + host_ratio; page++) { 2094 /* 2095 * Remark them as dirty, updating the count for any pages 2096 * that weren't previously dirty. 2097 */ 2098 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2099 } 2100 } 2101 2102 /* Find the next dirty page for the next iteration */ 2103 run_start = find_next_bit(bitmap, pages, run_start); 2104 } 2105 } 2106 2107 /** 2108 * postcopy_chunk_hostpages: discard any partially sent host page 2109 * 2110 * Utility for the outgoing postcopy code. 2111 * 2112 * Discard any partially sent host-page size chunks, mark any partially 2113 * dirty host-page size chunks as all dirty. In this case the host-page 2114 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 2115 * 2116 * Returns zero on success 2117 * 2118 * @ms: current migration state 2119 * @block: block we want to work with 2120 */ 2121 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2122 { 2123 postcopy_discard_send_init(ms, block->idstr); 2124 2125 /* 2126 * Ensure that all partially dirty host pages are made fully dirty. 2127 */ 2128 postcopy_chunk_hostpages_pass(ms, block); 2129 2130 postcopy_discard_send_finish(ms); 2131 return 0; 2132 } 2133 2134 /** 2135 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2136 * 2137 * Returns zero on success 2138 * 2139 * Transmit the set of pages to be discarded after precopy to the target 2140 * these are pages that: 2141 * a) Have been previously transmitted but are now dirty again 2142 * b) Pages that have never been transmitted, this ensures that 2143 * any pages on the destination that have been mapped by background 2144 * tasks get discarded (transparent huge pages is the specific concern) 2145 * Hopefully this is pretty sparse 2146 * 2147 * @ms: current migration state 2148 */ 2149 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2150 { 2151 RAMState *rs = ram_state; 2152 RAMBlock *block; 2153 int ret; 2154 2155 RCU_READ_LOCK_GUARD(); 2156 2157 /* This should be our last sync, the src is now paused */ 2158 migration_bitmap_sync(rs); 2159 2160 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2161 rs->last_seen_block = NULL; 2162 rs->last_sent_block = NULL; 2163 rs->last_page = 0; 2164 2165 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2166 /* Deal with TPS != HPS and huge pages */ 2167 ret = postcopy_chunk_hostpages(ms, block); 2168 if (ret) { 2169 return ret; 2170 } 2171 2172 #ifdef DEBUG_POSTCOPY 2173 ram_debug_dump_bitmap(block->bmap, true, 2174 block->used_length >> TARGET_PAGE_BITS); 2175 #endif 2176 } 2177 trace_ram_postcopy_send_discard_bitmap(); 2178 2179 return postcopy_each_ram_send_discard(ms); 2180 } 2181 2182 /** 2183 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2184 * 2185 * Returns zero on success 2186 * 2187 * @rbname: name of the RAMBlock of the request. NULL means the 2188 * same that last one. 2189 * @start: RAMBlock starting page 2190 * @length: RAMBlock size 2191 */ 2192 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2193 { 2194 trace_ram_discard_range(rbname, start, length); 2195 2196 RCU_READ_LOCK_GUARD(); 2197 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2198 2199 if (!rb) { 2200 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2201 return -1; 2202 } 2203 2204 /* 2205 * On source VM, we don't need to update the received bitmap since 2206 * we don't even have one. 2207 */ 2208 if (rb->receivedmap) { 2209 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2210 length >> qemu_target_page_bits()); 2211 } 2212 2213 return ram_block_discard_range(rb, start, length); 2214 } 2215 2216 /* 2217 * For every allocation, we will try not to crash the VM if the 2218 * allocation failed. 2219 */ 2220 static int xbzrle_init(void) 2221 { 2222 Error *local_err = NULL; 2223 2224 if (!migrate_use_xbzrle()) { 2225 return 0; 2226 } 2227 2228 XBZRLE_cache_lock(); 2229 2230 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2231 if (!XBZRLE.zero_target_page) { 2232 error_report("%s: Error allocating zero page", __func__); 2233 goto err_out; 2234 } 2235 2236 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2237 TARGET_PAGE_SIZE, &local_err); 2238 if (!XBZRLE.cache) { 2239 error_report_err(local_err); 2240 goto free_zero_page; 2241 } 2242 2243 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2244 if (!XBZRLE.encoded_buf) { 2245 error_report("%s: Error allocating encoded_buf", __func__); 2246 goto free_cache; 2247 } 2248 2249 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2250 if (!XBZRLE.current_buf) { 2251 error_report("%s: Error allocating current_buf", __func__); 2252 goto free_encoded_buf; 2253 } 2254 2255 /* We are all good */ 2256 XBZRLE_cache_unlock(); 2257 return 0; 2258 2259 free_encoded_buf: 2260 g_free(XBZRLE.encoded_buf); 2261 XBZRLE.encoded_buf = NULL; 2262 free_cache: 2263 cache_fini(XBZRLE.cache); 2264 XBZRLE.cache = NULL; 2265 free_zero_page: 2266 g_free(XBZRLE.zero_target_page); 2267 XBZRLE.zero_target_page = NULL; 2268 err_out: 2269 XBZRLE_cache_unlock(); 2270 return -ENOMEM; 2271 } 2272 2273 static int ram_state_init(RAMState **rsp) 2274 { 2275 *rsp = g_try_new0(RAMState, 1); 2276 2277 if (!*rsp) { 2278 error_report("%s: Init ramstate fail", __func__); 2279 return -1; 2280 } 2281 2282 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2283 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2284 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2285 2286 /* 2287 * Count the total number of pages used by ram blocks not including any 2288 * gaps due to alignment or unplugs. 2289 * This must match with the initial values of dirty bitmap. 2290 */ 2291 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2292 ram_state_reset(*rsp); 2293 2294 return 0; 2295 } 2296 2297 static void ram_list_init_bitmaps(void) 2298 { 2299 MigrationState *ms = migrate_get_current(); 2300 RAMBlock *block; 2301 unsigned long pages; 2302 uint8_t shift; 2303 2304 /* Skip setting bitmap if there is no RAM */ 2305 if (ram_bytes_total()) { 2306 shift = ms->clear_bitmap_shift; 2307 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2308 error_report("clear_bitmap_shift (%u) too big, using " 2309 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2310 shift = CLEAR_BITMAP_SHIFT_MAX; 2311 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2312 error_report("clear_bitmap_shift (%u) too small, using " 2313 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2314 shift = CLEAR_BITMAP_SHIFT_MIN; 2315 } 2316 2317 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2318 pages = block->max_length >> TARGET_PAGE_BITS; 2319 /* 2320 * The initial dirty bitmap for migration must be set with all 2321 * ones to make sure we'll migrate every guest RAM page to 2322 * destination. 2323 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2324 * new migration after a failed migration, ram_list. 2325 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2326 * guest memory. 2327 */ 2328 block->bmap = bitmap_new(pages); 2329 bitmap_set(block->bmap, 0, pages); 2330 block->clear_bmap_shift = shift; 2331 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2332 } 2333 } 2334 } 2335 2336 static void ram_init_bitmaps(RAMState *rs) 2337 { 2338 /* For memory_global_dirty_log_start below. */ 2339 qemu_mutex_lock_iothread(); 2340 qemu_mutex_lock_ramlist(); 2341 2342 WITH_RCU_READ_LOCK_GUARD() { 2343 ram_list_init_bitmaps(); 2344 memory_global_dirty_log_start(); 2345 migration_bitmap_sync_precopy(rs); 2346 } 2347 qemu_mutex_unlock_ramlist(); 2348 qemu_mutex_unlock_iothread(); 2349 } 2350 2351 static int ram_init_all(RAMState **rsp) 2352 { 2353 if (ram_state_init(rsp)) { 2354 return -1; 2355 } 2356 2357 if (xbzrle_init()) { 2358 ram_state_cleanup(rsp); 2359 return -1; 2360 } 2361 2362 ram_init_bitmaps(*rsp); 2363 2364 return 0; 2365 } 2366 2367 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2368 { 2369 RAMBlock *block; 2370 uint64_t pages = 0; 2371 2372 /* 2373 * Postcopy is not using xbzrle/compression, so no need for that. 2374 * Also, since source are already halted, we don't need to care 2375 * about dirty page logging as well. 2376 */ 2377 2378 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2379 pages += bitmap_count_one(block->bmap, 2380 block->used_length >> TARGET_PAGE_BITS); 2381 } 2382 2383 /* This may not be aligned with current bitmaps. Recalculate. */ 2384 rs->migration_dirty_pages = pages; 2385 2386 rs->last_seen_block = NULL; 2387 rs->last_sent_block = NULL; 2388 rs->last_page = 0; 2389 rs->last_version = ram_list.version; 2390 /* 2391 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2392 * matter what we have sent. 2393 */ 2394 rs->ram_bulk_stage = false; 2395 2396 /* Update RAMState cache of output QEMUFile */ 2397 rs->f = out; 2398 2399 trace_ram_state_resume_prepare(pages); 2400 } 2401 2402 /* 2403 * This function clears bits of the free pages reported by the caller from the 2404 * migration dirty bitmap. @addr is the host address corresponding to the 2405 * start of the continuous guest free pages, and @len is the total bytes of 2406 * those pages. 2407 */ 2408 void qemu_guest_free_page_hint(void *addr, size_t len) 2409 { 2410 RAMBlock *block; 2411 ram_addr_t offset; 2412 size_t used_len, start, npages; 2413 MigrationState *s = migrate_get_current(); 2414 2415 /* This function is currently expected to be used during live migration */ 2416 if (!migration_is_setup_or_active(s->state)) { 2417 return; 2418 } 2419 2420 for (; len > 0; len -= used_len, addr += used_len) { 2421 block = qemu_ram_block_from_host(addr, false, &offset); 2422 if (unlikely(!block || offset >= block->used_length)) { 2423 /* 2424 * The implementation might not support RAMBlock resize during 2425 * live migration, but it could happen in theory with future 2426 * updates. So we add a check here to capture that case. 2427 */ 2428 error_report_once("%s unexpected error", __func__); 2429 return; 2430 } 2431 2432 if (len <= block->used_length - offset) { 2433 used_len = len; 2434 } else { 2435 used_len = block->used_length - offset; 2436 } 2437 2438 start = offset >> TARGET_PAGE_BITS; 2439 npages = used_len >> TARGET_PAGE_BITS; 2440 2441 qemu_mutex_lock(&ram_state->bitmap_mutex); 2442 ram_state->migration_dirty_pages -= 2443 bitmap_count_one_with_offset(block->bmap, start, npages); 2444 bitmap_clear(block->bmap, start, npages); 2445 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2446 } 2447 } 2448 2449 /* 2450 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2451 * long-running RCU critical section. When rcu-reclaims in the code 2452 * start to become numerous it will be necessary to reduce the 2453 * granularity of these critical sections. 2454 */ 2455 2456 /** 2457 * ram_save_setup: Setup RAM for migration 2458 * 2459 * Returns zero to indicate success and negative for error 2460 * 2461 * @f: QEMUFile where to send the data 2462 * @opaque: RAMState pointer 2463 */ 2464 static int ram_save_setup(QEMUFile *f, void *opaque) 2465 { 2466 RAMState **rsp = opaque; 2467 RAMBlock *block; 2468 2469 if (compress_threads_save_setup()) { 2470 return -1; 2471 } 2472 2473 /* migration has already setup the bitmap, reuse it. */ 2474 if (!migration_in_colo_state()) { 2475 if (ram_init_all(rsp) != 0) { 2476 compress_threads_save_cleanup(); 2477 return -1; 2478 } 2479 } 2480 (*rsp)->f = f; 2481 2482 WITH_RCU_READ_LOCK_GUARD() { 2483 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2484 2485 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2486 qemu_put_byte(f, strlen(block->idstr)); 2487 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2488 qemu_put_be64(f, block->used_length); 2489 if (migrate_postcopy_ram() && block->page_size != 2490 qemu_host_page_size) { 2491 qemu_put_be64(f, block->page_size); 2492 } 2493 if (migrate_ignore_shared()) { 2494 qemu_put_be64(f, block->mr->addr); 2495 } 2496 } 2497 } 2498 2499 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2500 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2501 2502 multifd_send_sync_main(f); 2503 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2504 qemu_fflush(f); 2505 2506 return 0; 2507 } 2508 2509 /** 2510 * ram_save_iterate: iterative stage for migration 2511 * 2512 * Returns zero to indicate success and negative for error 2513 * 2514 * @f: QEMUFile where to send the data 2515 * @opaque: RAMState pointer 2516 */ 2517 static int ram_save_iterate(QEMUFile *f, void *opaque) 2518 { 2519 RAMState **temp = opaque; 2520 RAMState *rs = *temp; 2521 int ret = 0; 2522 int i; 2523 int64_t t0; 2524 int done = 0; 2525 2526 if (blk_mig_bulk_active()) { 2527 /* Avoid transferring ram during bulk phase of block migration as 2528 * the bulk phase will usually take a long time and transferring 2529 * ram updates during that time is pointless. */ 2530 goto out; 2531 } 2532 2533 WITH_RCU_READ_LOCK_GUARD() { 2534 if (ram_list.version != rs->last_version) { 2535 ram_state_reset(rs); 2536 } 2537 2538 /* Read version before ram_list.blocks */ 2539 smp_rmb(); 2540 2541 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2542 2543 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2544 i = 0; 2545 while ((ret = qemu_file_rate_limit(f)) == 0 || 2546 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2547 int pages; 2548 2549 if (qemu_file_get_error(f)) { 2550 break; 2551 } 2552 2553 pages = ram_find_and_save_block(rs, false); 2554 /* no more pages to sent */ 2555 if (pages == 0) { 2556 done = 1; 2557 break; 2558 } 2559 2560 if (pages < 0) { 2561 qemu_file_set_error(f, pages); 2562 break; 2563 } 2564 2565 rs->target_page_count += pages; 2566 2567 /* 2568 * During postcopy, it is necessary to make sure one whole host 2569 * page is sent in one chunk. 2570 */ 2571 if (migrate_postcopy_ram()) { 2572 flush_compressed_data(rs); 2573 } 2574 2575 /* 2576 * we want to check in the 1st loop, just in case it was the 1st 2577 * time and we had to sync the dirty bitmap. 2578 * qemu_clock_get_ns() is a bit expensive, so we only check each 2579 * some iterations 2580 */ 2581 if ((i & 63) == 0) { 2582 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2583 1000000; 2584 if (t1 > MAX_WAIT) { 2585 trace_ram_save_iterate_big_wait(t1, i); 2586 break; 2587 } 2588 } 2589 i++; 2590 } 2591 } 2592 2593 /* 2594 * Must occur before EOS (or any QEMUFile operation) 2595 * because of RDMA protocol. 2596 */ 2597 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2598 2599 out: 2600 if (ret >= 0 2601 && migration_is_setup_or_active(migrate_get_current()->state)) { 2602 multifd_send_sync_main(rs->f); 2603 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2604 qemu_fflush(f); 2605 ram_counters.transferred += 8; 2606 2607 ret = qemu_file_get_error(f); 2608 } 2609 if (ret < 0) { 2610 return ret; 2611 } 2612 2613 return done; 2614 } 2615 2616 /** 2617 * ram_save_complete: function called to send the remaining amount of ram 2618 * 2619 * Returns zero to indicate success or negative on error 2620 * 2621 * Called with iothread lock 2622 * 2623 * @f: QEMUFile where to send the data 2624 * @opaque: RAMState pointer 2625 */ 2626 static int ram_save_complete(QEMUFile *f, void *opaque) 2627 { 2628 RAMState **temp = opaque; 2629 RAMState *rs = *temp; 2630 int ret = 0; 2631 2632 WITH_RCU_READ_LOCK_GUARD() { 2633 if (!migration_in_postcopy()) { 2634 migration_bitmap_sync_precopy(rs); 2635 } 2636 2637 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2638 2639 /* try transferring iterative blocks of memory */ 2640 2641 /* flush all remaining blocks regardless of rate limiting */ 2642 while (true) { 2643 int pages; 2644 2645 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2646 /* no more blocks to sent */ 2647 if (pages == 0) { 2648 break; 2649 } 2650 if (pages < 0) { 2651 ret = pages; 2652 break; 2653 } 2654 } 2655 2656 flush_compressed_data(rs); 2657 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2658 } 2659 2660 if (ret >= 0) { 2661 multifd_send_sync_main(rs->f); 2662 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2663 qemu_fflush(f); 2664 } 2665 2666 return ret; 2667 } 2668 2669 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2670 uint64_t *res_precopy_only, 2671 uint64_t *res_compatible, 2672 uint64_t *res_postcopy_only) 2673 { 2674 RAMState **temp = opaque; 2675 RAMState *rs = *temp; 2676 uint64_t remaining_size; 2677 2678 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2679 2680 if (!migration_in_postcopy() && 2681 remaining_size < max_size) { 2682 qemu_mutex_lock_iothread(); 2683 WITH_RCU_READ_LOCK_GUARD() { 2684 migration_bitmap_sync_precopy(rs); 2685 } 2686 qemu_mutex_unlock_iothread(); 2687 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2688 } 2689 2690 if (migrate_postcopy_ram()) { 2691 /* We can do postcopy, and all the data is postcopiable */ 2692 *res_compatible += remaining_size; 2693 } else { 2694 *res_precopy_only += remaining_size; 2695 } 2696 } 2697 2698 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2699 { 2700 unsigned int xh_len; 2701 int xh_flags; 2702 uint8_t *loaded_data; 2703 2704 /* extract RLE header */ 2705 xh_flags = qemu_get_byte(f); 2706 xh_len = qemu_get_be16(f); 2707 2708 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2709 error_report("Failed to load XBZRLE page - wrong compression!"); 2710 return -1; 2711 } 2712 2713 if (xh_len > TARGET_PAGE_SIZE) { 2714 error_report("Failed to load XBZRLE page - len overflow!"); 2715 return -1; 2716 } 2717 loaded_data = XBZRLE.decoded_buf; 2718 /* load data and decode */ 2719 /* it can change loaded_data to point to an internal buffer */ 2720 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 2721 2722 /* decode RLE */ 2723 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 2724 TARGET_PAGE_SIZE) == -1) { 2725 error_report("Failed to load XBZRLE page - decode error!"); 2726 return -1; 2727 } 2728 2729 return 0; 2730 } 2731 2732 /** 2733 * ram_block_from_stream: read a RAMBlock id from the migration stream 2734 * 2735 * Must be called from within a rcu critical section. 2736 * 2737 * Returns a pointer from within the RCU-protected ram_list. 2738 * 2739 * @f: QEMUFile where to read the data from 2740 * @flags: Page flags (mostly to see if it's a continuation of previous block) 2741 */ 2742 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 2743 { 2744 static RAMBlock *block = NULL; 2745 char id[256]; 2746 uint8_t len; 2747 2748 if (flags & RAM_SAVE_FLAG_CONTINUE) { 2749 if (!block) { 2750 error_report("Ack, bad migration stream!"); 2751 return NULL; 2752 } 2753 return block; 2754 } 2755 2756 len = qemu_get_byte(f); 2757 qemu_get_buffer(f, (uint8_t *)id, len); 2758 id[len] = 0; 2759 2760 block = qemu_ram_block_by_name(id); 2761 if (!block) { 2762 error_report("Can't find block %s", id); 2763 return NULL; 2764 } 2765 2766 if (ramblock_is_ignored(block)) { 2767 error_report("block %s should not be migrated !", id); 2768 return NULL; 2769 } 2770 2771 return block; 2772 } 2773 2774 static inline void *host_from_ram_block_offset(RAMBlock *block, 2775 ram_addr_t offset) 2776 { 2777 if (!offset_in_ramblock(block, offset)) { 2778 return NULL; 2779 } 2780 2781 return block->host + offset; 2782 } 2783 2784 static inline void *colo_cache_from_block_offset(RAMBlock *block, 2785 ram_addr_t offset, bool record_bitmap) 2786 { 2787 if (!offset_in_ramblock(block, offset)) { 2788 return NULL; 2789 } 2790 if (!block->colo_cache) { 2791 error_report("%s: colo_cache is NULL in block :%s", 2792 __func__, block->idstr); 2793 return NULL; 2794 } 2795 2796 /* 2797 * During colo checkpoint, we need bitmap of these migrated pages. 2798 * It help us to decide which pages in ram cache should be flushed 2799 * into VM's RAM later. 2800 */ 2801 if (record_bitmap && 2802 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 2803 ram_state->migration_dirty_pages++; 2804 } 2805 return block->colo_cache + offset; 2806 } 2807 2808 /** 2809 * ram_handle_compressed: handle the zero page case 2810 * 2811 * If a page (or a whole RDMA chunk) has been 2812 * determined to be zero, then zap it. 2813 * 2814 * @host: host address for the zero page 2815 * @ch: what the page is filled from. We only support zero 2816 * @size: size of the zero page 2817 */ 2818 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 2819 { 2820 if (ch != 0 || !is_zero_range(host, size)) { 2821 memset(host, ch, size); 2822 } 2823 } 2824 2825 /* return the size after decompression, or negative value on error */ 2826 static int 2827 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 2828 const uint8_t *source, size_t source_len) 2829 { 2830 int err; 2831 2832 err = inflateReset(stream); 2833 if (err != Z_OK) { 2834 return -1; 2835 } 2836 2837 stream->avail_in = source_len; 2838 stream->next_in = (uint8_t *)source; 2839 stream->avail_out = dest_len; 2840 stream->next_out = dest; 2841 2842 err = inflate(stream, Z_NO_FLUSH); 2843 if (err != Z_STREAM_END) { 2844 return -1; 2845 } 2846 2847 return stream->total_out; 2848 } 2849 2850 static void *do_data_decompress(void *opaque) 2851 { 2852 DecompressParam *param = opaque; 2853 unsigned long pagesize; 2854 uint8_t *des; 2855 int len, ret; 2856 2857 qemu_mutex_lock(¶m->mutex); 2858 while (!param->quit) { 2859 if (param->des) { 2860 des = param->des; 2861 len = param->len; 2862 param->des = 0; 2863 qemu_mutex_unlock(¶m->mutex); 2864 2865 pagesize = TARGET_PAGE_SIZE; 2866 2867 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 2868 param->compbuf, len); 2869 if (ret < 0 && migrate_get_current()->decompress_error_check) { 2870 error_report("decompress data failed"); 2871 qemu_file_set_error(decomp_file, ret); 2872 } 2873 2874 qemu_mutex_lock(&decomp_done_lock); 2875 param->done = true; 2876 qemu_cond_signal(&decomp_done_cond); 2877 qemu_mutex_unlock(&decomp_done_lock); 2878 2879 qemu_mutex_lock(¶m->mutex); 2880 } else { 2881 qemu_cond_wait(¶m->cond, ¶m->mutex); 2882 } 2883 } 2884 qemu_mutex_unlock(¶m->mutex); 2885 2886 return NULL; 2887 } 2888 2889 static int wait_for_decompress_done(void) 2890 { 2891 int idx, thread_count; 2892 2893 if (!migrate_use_compression()) { 2894 return 0; 2895 } 2896 2897 thread_count = migrate_decompress_threads(); 2898 qemu_mutex_lock(&decomp_done_lock); 2899 for (idx = 0; idx < thread_count; idx++) { 2900 while (!decomp_param[idx].done) { 2901 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 2902 } 2903 } 2904 qemu_mutex_unlock(&decomp_done_lock); 2905 return qemu_file_get_error(decomp_file); 2906 } 2907 2908 static void compress_threads_load_cleanup(void) 2909 { 2910 int i, thread_count; 2911 2912 if (!migrate_use_compression()) { 2913 return; 2914 } 2915 thread_count = migrate_decompress_threads(); 2916 for (i = 0; i < thread_count; i++) { 2917 /* 2918 * we use it as a indicator which shows if the thread is 2919 * properly init'd or not 2920 */ 2921 if (!decomp_param[i].compbuf) { 2922 break; 2923 } 2924 2925 qemu_mutex_lock(&decomp_param[i].mutex); 2926 decomp_param[i].quit = true; 2927 qemu_cond_signal(&decomp_param[i].cond); 2928 qemu_mutex_unlock(&decomp_param[i].mutex); 2929 } 2930 for (i = 0; i < thread_count; i++) { 2931 if (!decomp_param[i].compbuf) { 2932 break; 2933 } 2934 2935 qemu_thread_join(decompress_threads + i); 2936 qemu_mutex_destroy(&decomp_param[i].mutex); 2937 qemu_cond_destroy(&decomp_param[i].cond); 2938 inflateEnd(&decomp_param[i].stream); 2939 g_free(decomp_param[i].compbuf); 2940 decomp_param[i].compbuf = NULL; 2941 } 2942 g_free(decompress_threads); 2943 g_free(decomp_param); 2944 decompress_threads = NULL; 2945 decomp_param = NULL; 2946 decomp_file = NULL; 2947 } 2948 2949 static int compress_threads_load_setup(QEMUFile *f) 2950 { 2951 int i, thread_count; 2952 2953 if (!migrate_use_compression()) { 2954 return 0; 2955 } 2956 2957 thread_count = migrate_decompress_threads(); 2958 decompress_threads = g_new0(QemuThread, thread_count); 2959 decomp_param = g_new0(DecompressParam, thread_count); 2960 qemu_mutex_init(&decomp_done_lock); 2961 qemu_cond_init(&decomp_done_cond); 2962 decomp_file = f; 2963 for (i = 0; i < thread_count; i++) { 2964 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 2965 goto exit; 2966 } 2967 2968 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 2969 qemu_mutex_init(&decomp_param[i].mutex); 2970 qemu_cond_init(&decomp_param[i].cond); 2971 decomp_param[i].done = true; 2972 decomp_param[i].quit = false; 2973 qemu_thread_create(decompress_threads + i, "decompress", 2974 do_data_decompress, decomp_param + i, 2975 QEMU_THREAD_JOINABLE); 2976 } 2977 return 0; 2978 exit: 2979 compress_threads_load_cleanup(); 2980 return -1; 2981 } 2982 2983 static void decompress_data_with_multi_threads(QEMUFile *f, 2984 void *host, int len) 2985 { 2986 int idx, thread_count; 2987 2988 thread_count = migrate_decompress_threads(); 2989 qemu_mutex_lock(&decomp_done_lock); 2990 while (true) { 2991 for (idx = 0; idx < thread_count; idx++) { 2992 if (decomp_param[idx].done) { 2993 decomp_param[idx].done = false; 2994 qemu_mutex_lock(&decomp_param[idx].mutex); 2995 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 2996 decomp_param[idx].des = host; 2997 decomp_param[idx].len = len; 2998 qemu_cond_signal(&decomp_param[idx].cond); 2999 qemu_mutex_unlock(&decomp_param[idx].mutex); 3000 break; 3001 } 3002 } 3003 if (idx < thread_count) { 3004 break; 3005 } else { 3006 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3007 } 3008 } 3009 qemu_mutex_unlock(&decomp_done_lock); 3010 } 3011 3012 /* 3013 * colo cache: this is for secondary VM, we cache the whole 3014 * memory of the secondary VM, it is need to hold the global lock 3015 * to call this helper. 3016 */ 3017 int colo_init_ram_cache(void) 3018 { 3019 RAMBlock *block; 3020 3021 WITH_RCU_READ_LOCK_GUARD() { 3022 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3023 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3024 NULL, 3025 false); 3026 if (!block->colo_cache) { 3027 error_report("%s: Can't alloc memory for COLO cache of block %s," 3028 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3029 block->used_length); 3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3031 if (block->colo_cache) { 3032 qemu_anon_ram_free(block->colo_cache, block->used_length); 3033 block->colo_cache = NULL; 3034 } 3035 } 3036 return -errno; 3037 } 3038 } 3039 } 3040 3041 /* 3042 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3043 * with to decide which page in cache should be flushed into SVM's RAM. Here 3044 * we use the same name 'ram_bitmap' as for migration. 3045 */ 3046 if (ram_bytes_total()) { 3047 RAMBlock *block; 3048 3049 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3050 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3051 block->bmap = bitmap_new(pages); 3052 } 3053 } 3054 3055 ram_state_init(&ram_state); 3056 return 0; 3057 } 3058 3059 /* TODO: duplicated with ram_init_bitmaps */ 3060 void colo_incoming_start_dirty_log(void) 3061 { 3062 RAMBlock *block = NULL; 3063 /* For memory_global_dirty_log_start below. */ 3064 qemu_mutex_lock_iothread(); 3065 qemu_mutex_lock_ramlist(); 3066 3067 memory_global_dirty_log_sync(); 3068 WITH_RCU_READ_LOCK_GUARD() { 3069 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3070 ramblock_sync_dirty_bitmap(ram_state, block); 3071 /* Discard this dirty bitmap record */ 3072 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3073 } 3074 memory_global_dirty_log_start(); 3075 } 3076 ram_state->migration_dirty_pages = 0; 3077 qemu_mutex_unlock_ramlist(); 3078 qemu_mutex_unlock_iothread(); 3079 } 3080 3081 /* It is need to hold the global lock to call this helper */ 3082 void colo_release_ram_cache(void) 3083 { 3084 RAMBlock *block; 3085 3086 memory_global_dirty_log_stop(); 3087 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3088 g_free(block->bmap); 3089 block->bmap = NULL; 3090 } 3091 3092 WITH_RCU_READ_LOCK_GUARD() { 3093 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3094 if (block->colo_cache) { 3095 qemu_anon_ram_free(block->colo_cache, block->used_length); 3096 block->colo_cache = NULL; 3097 } 3098 } 3099 } 3100 ram_state_cleanup(&ram_state); 3101 } 3102 3103 /** 3104 * ram_load_setup: Setup RAM for migration incoming side 3105 * 3106 * Returns zero to indicate success and negative for error 3107 * 3108 * @f: QEMUFile where to receive the data 3109 * @opaque: RAMState pointer 3110 */ 3111 static int ram_load_setup(QEMUFile *f, void *opaque) 3112 { 3113 if (compress_threads_load_setup(f)) { 3114 return -1; 3115 } 3116 3117 xbzrle_load_setup(); 3118 ramblock_recv_map_init(); 3119 3120 return 0; 3121 } 3122 3123 static int ram_load_cleanup(void *opaque) 3124 { 3125 RAMBlock *rb; 3126 3127 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3128 qemu_ram_block_writeback(rb); 3129 } 3130 3131 xbzrle_load_cleanup(); 3132 compress_threads_load_cleanup(); 3133 3134 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3135 g_free(rb->receivedmap); 3136 rb->receivedmap = NULL; 3137 } 3138 3139 return 0; 3140 } 3141 3142 /** 3143 * ram_postcopy_incoming_init: allocate postcopy data structures 3144 * 3145 * Returns 0 for success and negative if there was one error 3146 * 3147 * @mis: current migration incoming state 3148 * 3149 * Allocate data structures etc needed by incoming migration with 3150 * postcopy-ram. postcopy-ram's similarly names 3151 * postcopy_ram_incoming_init does the work. 3152 */ 3153 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3154 { 3155 return postcopy_ram_incoming_init(mis); 3156 } 3157 3158 /** 3159 * ram_load_postcopy: load a page in postcopy case 3160 * 3161 * Returns 0 for success or -errno in case of error 3162 * 3163 * Called in postcopy mode by ram_load(). 3164 * rcu_read_lock is taken prior to this being called. 3165 * 3166 * @f: QEMUFile where to send the data 3167 */ 3168 static int ram_load_postcopy(QEMUFile *f) 3169 { 3170 int flags = 0, ret = 0; 3171 bool place_needed = false; 3172 bool matches_target_page_size = false; 3173 MigrationIncomingState *mis = migration_incoming_get_current(); 3174 /* Temporary page that is later 'placed' */ 3175 void *postcopy_host_page = mis->postcopy_tmp_page; 3176 void *this_host = NULL; 3177 bool all_zero = true; 3178 int target_pages = 0; 3179 3180 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3181 ram_addr_t addr; 3182 void *host = NULL; 3183 void *page_buffer = NULL; 3184 void *place_source = NULL; 3185 RAMBlock *block = NULL; 3186 uint8_t ch; 3187 int len; 3188 3189 addr = qemu_get_be64(f); 3190 3191 /* 3192 * If qemu file error, we should stop here, and then "addr" 3193 * may be invalid 3194 */ 3195 ret = qemu_file_get_error(f); 3196 if (ret) { 3197 break; 3198 } 3199 3200 flags = addr & ~TARGET_PAGE_MASK; 3201 addr &= TARGET_PAGE_MASK; 3202 3203 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3204 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3205 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3206 block = ram_block_from_stream(f, flags); 3207 3208 host = host_from_ram_block_offset(block, addr); 3209 if (!host) { 3210 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3211 ret = -EINVAL; 3212 break; 3213 } 3214 target_pages++; 3215 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3216 /* 3217 * Postcopy requires that we place whole host pages atomically; 3218 * these may be huge pages for RAMBlocks that are backed by 3219 * hugetlbfs. 3220 * To make it atomic, the data is read into a temporary page 3221 * that's moved into place later. 3222 * The migration protocol uses, possibly smaller, target-pages 3223 * however the source ensures it always sends all the components 3224 * of a host page in one chunk. 3225 */ 3226 page_buffer = postcopy_host_page + 3227 ((uintptr_t)host & (block->page_size - 1)); 3228 if (target_pages == 1) { 3229 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3230 block->page_size); 3231 } else { 3232 /* not the 1st TP within the HP */ 3233 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != 3234 (uintptr_t)this_host) { 3235 error_report("Non-same host page %p/%p", 3236 host, this_host); 3237 ret = -EINVAL; 3238 break; 3239 } 3240 } 3241 3242 /* 3243 * If it's the last part of a host page then we place the host 3244 * page 3245 */ 3246 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3247 place_needed = true; 3248 } 3249 place_source = postcopy_host_page; 3250 } 3251 3252 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3253 case RAM_SAVE_FLAG_ZERO: 3254 ch = qemu_get_byte(f); 3255 /* 3256 * Can skip to set page_buffer when 3257 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3258 */ 3259 if (ch || !matches_target_page_size) { 3260 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3261 } 3262 if (ch) { 3263 all_zero = false; 3264 } 3265 break; 3266 3267 case RAM_SAVE_FLAG_PAGE: 3268 all_zero = false; 3269 if (!matches_target_page_size) { 3270 /* For huge pages, we always use temporary buffer */ 3271 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3272 } else { 3273 /* 3274 * For small pages that matches target page size, we 3275 * avoid the qemu_file copy. Instead we directly use 3276 * the buffer of QEMUFile to place the page. Note: we 3277 * cannot do any QEMUFile operation before using that 3278 * buffer to make sure the buffer is valid when 3279 * placing the page. 3280 */ 3281 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3282 TARGET_PAGE_SIZE); 3283 } 3284 break; 3285 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3286 all_zero = false; 3287 len = qemu_get_be32(f); 3288 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3289 error_report("Invalid compressed data length: %d", len); 3290 ret = -EINVAL; 3291 break; 3292 } 3293 decompress_data_with_multi_threads(f, page_buffer, len); 3294 break; 3295 3296 case RAM_SAVE_FLAG_EOS: 3297 /* normal exit */ 3298 multifd_recv_sync_main(); 3299 break; 3300 default: 3301 error_report("Unknown combination of migration flags: %#x" 3302 " (postcopy mode)", flags); 3303 ret = -EINVAL; 3304 break; 3305 } 3306 3307 /* Got the whole host page, wait for decompress before placing. */ 3308 if (place_needed) { 3309 ret |= wait_for_decompress_done(); 3310 } 3311 3312 /* Detect for any possible file errors */ 3313 if (!ret && qemu_file_get_error(f)) { 3314 ret = qemu_file_get_error(f); 3315 } 3316 3317 if (!ret && place_needed) { 3318 /* This gets called at the last target page in the host page */ 3319 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3320 block->page_size); 3321 3322 if (all_zero) { 3323 ret = postcopy_place_page_zero(mis, place_dest, 3324 block); 3325 } else { 3326 ret = postcopy_place_page(mis, place_dest, 3327 place_source, block); 3328 } 3329 place_needed = false; 3330 target_pages = 0; 3331 /* Assume we have a zero page until we detect something different */ 3332 all_zero = true; 3333 } 3334 } 3335 3336 return ret; 3337 } 3338 3339 static bool postcopy_is_advised(void) 3340 { 3341 PostcopyState ps = postcopy_state_get(); 3342 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3343 } 3344 3345 static bool postcopy_is_running(void) 3346 { 3347 PostcopyState ps = postcopy_state_get(); 3348 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3349 } 3350 3351 /* 3352 * Flush content of RAM cache into SVM's memory. 3353 * Only flush the pages that be dirtied by PVM or SVM or both. 3354 */ 3355 void colo_flush_ram_cache(void) 3356 { 3357 RAMBlock *block = NULL; 3358 void *dst_host; 3359 void *src_host; 3360 unsigned long offset = 0; 3361 3362 memory_global_dirty_log_sync(); 3363 WITH_RCU_READ_LOCK_GUARD() { 3364 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3365 ramblock_sync_dirty_bitmap(ram_state, block); 3366 } 3367 } 3368 3369 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3370 WITH_RCU_READ_LOCK_GUARD() { 3371 block = QLIST_FIRST_RCU(&ram_list.blocks); 3372 3373 while (block) { 3374 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3375 3376 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3377 >= block->used_length) { 3378 offset = 0; 3379 block = QLIST_NEXT_RCU(block, next); 3380 } else { 3381 migration_bitmap_clear_dirty(ram_state, block, offset); 3382 dst_host = block->host 3383 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3384 src_host = block->colo_cache 3385 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3386 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3387 } 3388 } 3389 } 3390 trace_colo_flush_ram_cache_end(); 3391 } 3392 3393 /** 3394 * ram_load_precopy: load pages in precopy case 3395 * 3396 * Returns 0 for success or -errno in case of error 3397 * 3398 * Called in precopy mode by ram_load(). 3399 * rcu_read_lock is taken prior to this being called. 3400 * 3401 * @f: QEMUFile where to send the data 3402 */ 3403 static int ram_load_precopy(QEMUFile *f) 3404 { 3405 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3406 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3407 bool postcopy_advised = postcopy_is_advised(); 3408 if (!migrate_use_compression()) { 3409 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3410 } 3411 3412 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3413 ram_addr_t addr, total_ram_bytes; 3414 void *host = NULL, *host_bak = NULL; 3415 uint8_t ch; 3416 3417 /* 3418 * Yield periodically to let main loop run, but an iteration of 3419 * the main loop is expensive, so do it each some iterations 3420 */ 3421 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3422 aio_co_schedule(qemu_get_current_aio_context(), 3423 qemu_coroutine_self()); 3424 qemu_coroutine_yield(); 3425 } 3426 i++; 3427 3428 addr = qemu_get_be64(f); 3429 flags = addr & ~TARGET_PAGE_MASK; 3430 addr &= TARGET_PAGE_MASK; 3431 3432 if (flags & invalid_flags) { 3433 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3434 error_report("Received an unexpected compressed page"); 3435 } 3436 3437 ret = -EINVAL; 3438 break; 3439 } 3440 3441 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3442 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3443 RAMBlock *block = ram_block_from_stream(f, flags); 3444 3445 host = host_from_ram_block_offset(block, addr); 3446 /* 3447 * After going into COLO stage, we should not load the page 3448 * into SVM's memory directly, we put them into colo_cache firstly. 3449 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3450 * Previously, we copied all these memory in preparing stage of COLO 3451 * while we need to stop VM, which is a time-consuming process. 3452 * Here we optimize it by a trick, back-up every page while in 3453 * migration process while COLO is enabled, though it affects the 3454 * speed of the migration, but it obviously reduce the downtime of 3455 * back-up all SVM'S memory in COLO preparing stage. 3456 */ 3457 if (migration_incoming_colo_enabled()) { 3458 if (migration_incoming_in_colo_state()) { 3459 /* In COLO stage, put all pages into cache temporarily */ 3460 host = colo_cache_from_block_offset(block, addr, true); 3461 } else { 3462 /* 3463 * In migration stage but before COLO stage, 3464 * Put all pages into both cache and SVM's memory. 3465 */ 3466 host_bak = colo_cache_from_block_offset(block, addr, false); 3467 } 3468 } 3469 if (!host) { 3470 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3471 ret = -EINVAL; 3472 break; 3473 } 3474 if (!migration_incoming_in_colo_state()) { 3475 ramblock_recv_bitmap_set(block, host); 3476 } 3477 3478 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3479 } 3480 3481 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3482 case RAM_SAVE_FLAG_MEM_SIZE: 3483 /* Synchronize RAM block list */ 3484 total_ram_bytes = addr; 3485 while (!ret && total_ram_bytes) { 3486 RAMBlock *block; 3487 char id[256]; 3488 ram_addr_t length; 3489 3490 len = qemu_get_byte(f); 3491 qemu_get_buffer(f, (uint8_t *)id, len); 3492 id[len] = 0; 3493 length = qemu_get_be64(f); 3494 3495 block = qemu_ram_block_by_name(id); 3496 if (block && !qemu_ram_is_migratable(block)) { 3497 error_report("block %s should not be migrated !", id); 3498 ret = -EINVAL; 3499 } else if (block) { 3500 if (length != block->used_length) { 3501 Error *local_err = NULL; 3502 3503 ret = qemu_ram_resize(block, length, 3504 &local_err); 3505 if (local_err) { 3506 error_report_err(local_err); 3507 } 3508 } 3509 /* For postcopy we need to check hugepage sizes match */ 3510 if (postcopy_advised && 3511 block->page_size != qemu_host_page_size) { 3512 uint64_t remote_page_size = qemu_get_be64(f); 3513 if (remote_page_size != block->page_size) { 3514 error_report("Mismatched RAM page size %s " 3515 "(local) %zd != %" PRId64, 3516 id, block->page_size, 3517 remote_page_size); 3518 ret = -EINVAL; 3519 } 3520 } 3521 if (migrate_ignore_shared()) { 3522 hwaddr addr = qemu_get_be64(f); 3523 if (ramblock_is_ignored(block) && 3524 block->mr->addr != addr) { 3525 error_report("Mismatched GPAs for block %s " 3526 "%" PRId64 "!= %" PRId64, 3527 id, (uint64_t)addr, 3528 (uint64_t)block->mr->addr); 3529 ret = -EINVAL; 3530 } 3531 } 3532 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3533 block->idstr); 3534 } else { 3535 error_report("Unknown ramblock \"%s\", cannot " 3536 "accept migration", id); 3537 ret = -EINVAL; 3538 } 3539 3540 total_ram_bytes -= length; 3541 } 3542 break; 3543 3544 case RAM_SAVE_FLAG_ZERO: 3545 ch = qemu_get_byte(f); 3546 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3547 break; 3548 3549 case RAM_SAVE_FLAG_PAGE: 3550 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3551 break; 3552 3553 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3554 len = qemu_get_be32(f); 3555 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3556 error_report("Invalid compressed data length: %d", len); 3557 ret = -EINVAL; 3558 break; 3559 } 3560 decompress_data_with_multi_threads(f, host, len); 3561 break; 3562 3563 case RAM_SAVE_FLAG_XBZRLE: 3564 if (load_xbzrle(f, addr, host) < 0) { 3565 error_report("Failed to decompress XBZRLE page at " 3566 RAM_ADDR_FMT, addr); 3567 ret = -EINVAL; 3568 break; 3569 } 3570 break; 3571 case RAM_SAVE_FLAG_EOS: 3572 /* normal exit */ 3573 multifd_recv_sync_main(); 3574 break; 3575 default: 3576 if (flags & RAM_SAVE_FLAG_HOOK) { 3577 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3578 } else { 3579 error_report("Unknown combination of migration flags: %#x", 3580 flags); 3581 ret = -EINVAL; 3582 } 3583 } 3584 if (!ret) { 3585 ret = qemu_file_get_error(f); 3586 } 3587 if (!ret && host_bak) { 3588 memcpy(host_bak, host, TARGET_PAGE_SIZE); 3589 } 3590 } 3591 3592 ret |= wait_for_decompress_done(); 3593 return ret; 3594 } 3595 3596 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3597 { 3598 int ret = 0; 3599 static uint64_t seq_iter; 3600 /* 3601 * If system is running in postcopy mode, page inserts to host memory must 3602 * be atomic 3603 */ 3604 bool postcopy_running = postcopy_is_running(); 3605 3606 seq_iter++; 3607 3608 if (version_id != 4) { 3609 return -EINVAL; 3610 } 3611 3612 /* 3613 * This RCU critical section can be very long running. 3614 * When RCU reclaims in the code start to become numerous, 3615 * it will be necessary to reduce the granularity of this 3616 * critical section. 3617 */ 3618 WITH_RCU_READ_LOCK_GUARD() { 3619 if (postcopy_running) { 3620 ret = ram_load_postcopy(f); 3621 } else { 3622 ret = ram_load_precopy(f); 3623 } 3624 } 3625 trace_ram_load_complete(ret, seq_iter); 3626 3627 return ret; 3628 } 3629 3630 static bool ram_has_postcopy(void *opaque) 3631 { 3632 RAMBlock *rb; 3633 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3634 if (ramblock_is_pmem(rb)) { 3635 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 3636 "is not supported now!", rb->idstr, rb->host); 3637 return false; 3638 } 3639 } 3640 3641 return migrate_postcopy_ram(); 3642 } 3643 3644 /* Sync all the dirty bitmap with destination VM. */ 3645 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3646 { 3647 RAMBlock *block; 3648 QEMUFile *file = s->to_dst_file; 3649 int ramblock_count = 0; 3650 3651 trace_ram_dirty_bitmap_sync_start(); 3652 3653 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3654 qemu_savevm_send_recv_bitmap(file, block->idstr); 3655 trace_ram_dirty_bitmap_request(block->idstr); 3656 ramblock_count++; 3657 } 3658 3659 trace_ram_dirty_bitmap_sync_wait(); 3660 3661 /* Wait until all the ramblocks' dirty bitmap synced */ 3662 while (ramblock_count--) { 3663 qemu_sem_wait(&s->rp_state.rp_sem); 3664 } 3665 3666 trace_ram_dirty_bitmap_sync_complete(); 3667 3668 return 0; 3669 } 3670 3671 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3672 { 3673 qemu_sem_post(&s->rp_state.rp_sem); 3674 } 3675 3676 /* 3677 * Read the received bitmap, revert it as the initial dirty bitmap. 3678 * This is only used when the postcopy migration is paused but wants 3679 * to resume from a middle point. 3680 */ 3681 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3682 { 3683 int ret = -EINVAL; 3684 QEMUFile *file = s->rp_state.from_dst_file; 3685 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3686 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 3687 uint64_t size, end_mark; 3688 3689 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3690 3691 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3692 error_report("%s: incorrect state %s", __func__, 3693 MigrationStatus_str(s->state)); 3694 return -EINVAL; 3695 } 3696 3697 /* 3698 * Note: see comments in ramblock_recv_bitmap_send() on why we 3699 * need the endianness conversion, and the paddings. 3700 */ 3701 local_size = ROUND_UP(local_size, 8); 3702 3703 /* Add paddings */ 3704 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 3705 3706 size = qemu_get_be64(file); 3707 3708 /* The size of the bitmap should match with our ramblock */ 3709 if (size != local_size) { 3710 error_report("%s: ramblock '%s' bitmap size mismatch " 3711 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 3712 block->idstr, size, local_size); 3713 ret = -EINVAL; 3714 goto out; 3715 } 3716 3717 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 3718 end_mark = qemu_get_be64(file); 3719 3720 ret = qemu_file_get_error(file); 3721 if (ret || size != local_size) { 3722 error_report("%s: read bitmap failed for ramblock '%s': %d" 3723 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 3724 __func__, block->idstr, ret, local_size, size); 3725 ret = -EIO; 3726 goto out; 3727 } 3728 3729 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 3730 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64, 3731 __func__, block->idstr, end_mark); 3732 ret = -EINVAL; 3733 goto out; 3734 } 3735 3736 /* 3737 * Endianness conversion. We are during postcopy (though paused). 3738 * The dirty bitmap won't change. We can directly modify it. 3739 */ 3740 bitmap_from_le(block->bmap, le_bitmap, nbits); 3741 3742 /* 3743 * What we received is "received bitmap". Revert it as the initial 3744 * dirty bitmap for this ramblock. 3745 */ 3746 bitmap_complement(block->bmap, block->bmap, nbits); 3747 3748 trace_ram_dirty_bitmap_reload_complete(block->idstr); 3749 3750 /* 3751 * We succeeded to sync bitmap for current ramblock. If this is 3752 * the last one to sync, we need to notify the main send thread. 3753 */ 3754 ram_dirty_bitmap_reload_notify(s); 3755 3756 ret = 0; 3757 out: 3758 g_free(le_bitmap); 3759 return ret; 3760 } 3761 3762 static int ram_resume_prepare(MigrationState *s, void *opaque) 3763 { 3764 RAMState *rs = *(RAMState **)opaque; 3765 int ret; 3766 3767 ret = ram_dirty_bitmap_sync_all(s, rs); 3768 if (ret) { 3769 return ret; 3770 } 3771 3772 ram_state_resume_prepare(rs, s->to_dst_file); 3773 3774 return 0; 3775 } 3776 3777 static SaveVMHandlers savevm_ram_handlers = { 3778 .save_setup = ram_save_setup, 3779 .save_live_iterate = ram_save_iterate, 3780 .save_live_complete_postcopy = ram_save_complete, 3781 .save_live_complete_precopy = ram_save_complete, 3782 .has_postcopy = ram_has_postcopy, 3783 .save_live_pending = ram_save_pending, 3784 .load_state = ram_load, 3785 .save_cleanup = ram_save_cleanup, 3786 .load_setup = ram_load_setup, 3787 .load_cleanup = ram_load_cleanup, 3788 .resume_prepare = ram_resume_prepare, 3789 }; 3790 3791 void ram_mig_init(void) 3792 { 3793 qemu_mutex_init(&XBZRLE.lock); 3794 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 3795 } 3796