1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "cpu.h" 31 #include "qemu/cutils.h" 32 #include "qemu/bitops.h" 33 #include "qemu/bitmap.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram.h" 37 #include "migration.h" 38 #include "migration/register.h" 39 #include "migration/misc.h" 40 #include "qemu-file.h" 41 #include "postcopy-ram.h" 42 #include "page_cache.h" 43 #include "qemu/error-report.h" 44 #include "qapi/error.h" 45 #include "qapi/qapi-types-migration.h" 46 #include "qapi/qapi-events-migration.h" 47 #include "qapi/qmp/qerror.h" 48 #include "trace.h" 49 #include "exec/ram_addr.h" 50 #include "exec/target_page.h" 51 #include "qemu/rcu_queue.h" 52 #include "migration/colo.h" 53 #include "block.h" 54 #include "sysemu/sysemu.h" 55 #include "savevm.h" 56 #include "qemu/iov.h" 57 #include "multifd.h" 58 59 /***********************************************************/ 60 /* ram save/restore */ 61 62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it 63 * worked for pages that were filled with the same char. We switched 64 * it to only search for the zero value. It was then renamed to avoid 65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE. 66 */ 67 68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 69 #define RAM_SAVE_FLAG_ZERO 0x02 70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 71 #define RAM_SAVE_FLAG_PAGE 0x08 72 #define RAM_SAVE_FLAG_EOS 0x10 73 #define RAM_SAVE_FLAG_CONTINUE 0x20 74 #define RAM_SAVE_FLAG_XBZRLE 0x40 75 /* 0x80 is reserved in migration.h; start with 0x100 next */ 76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 77 78 static inline bool is_zero_range(uint8_t *p, uint64_t size) 79 { 80 return buffer_is_zero(p, size); 81 } 82 83 XBZRLECacheStats xbzrle_counters; 84 85 /* struct contains XBZRLE cache and a static page 86 used by the compression */ 87 static struct { 88 /* buffer used for XBZRLE encoding */ 89 uint8_t *encoded_buf; 90 /* buffer for storing page content */ 91 uint8_t *current_buf; 92 /* Cache for XBZRLE, protected by lock.
*/ 93 PageCache *cache; 94 QemuMutex lock; 95 /* it will store a page full of zeros */ 96 uint8_t *zero_target_page; 97 /* buffer used for XBZRLE decoding */ 98 uint8_t *decoded_buf; 99 } XBZRLE; 100 101 static void XBZRLE_cache_lock(void) 102 { 103 if (migrate_use_xbzrle()) 104 qemu_mutex_lock(&XBZRLE.lock); 105 } 106 107 static void XBZRLE_cache_unlock(void) 108 { 109 if (migrate_use_xbzrle()) 110 qemu_mutex_unlock(&XBZRLE.lock); 111 } 112 113 /** 114 * xbzrle_cache_resize: resize the xbzrle cache 115 * 116 * This function is called from qmp_migrate_set_cache_size in main 117 * thread, possibly while a migration is in progress. A running 118 * migration may be using the cache and might finish during this call, 119 * hence changes to the cache are protected by XBZRLE.lock(). 120 * 121 * Returns 0 for success or -1 for error 122 * 123 * @new_size: new cache size 124 * @errp: set *errp if the check failed, with reason 125 */ 126 int xbzrle_cache_resize(int64_t new_size, Error **errp) 127 { 128 PageCache *new_cache; 129 int64_t ret = 0; 130 131 /* Check for truncation */ 132 if (new_size != (size_t)new_size) { 133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 134 "exceeding address space"); 135 return -1; 136 } 137 138 if (new_size == migrate_xbzrle_cache_size()) { 139 /* nothing to do */ 140 return 0; 141 } 142 143 XBZRLE_cache_lock(); 144 145 if (XBZRLE.cache != NULL) { 146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 147 if (!new_cache) { 148 ret = -1; 149 goto out; 150 } 151 152 cache_fini(XBZRLE.cache); 153 XBZRLE.cache = new_cache; 154 } 155 out: 156 XBZRLE_cache_unlock(); 157 return ret; 158 } 159 160 static bool ramblock_is_ignored(RAMBlock *block) 161 { 162 return !qemu_ram_is_migratable(block) || 163 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 164 } 165 166 /* Should be holding either ram_list.mutex, or the RCU lock. 
*/ 167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \ 168 INTERNAL_RAMBLOCK_FOREACH(block) \ 169 if (ramblock_is_ignored(block)) {} else 170 171 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \ 172 INTERNAL_RAMBLOCK_FOREACH(block) \ 173 if (!qemu_ram_is_migratable(block)) {} else 174 175 #undef RAMBLOCK_FOREACH 176 177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 178 { 179 RAMBlock *block; 180 int ret = 0; 181 182 RCU_READ_LOCK_GUARD(); 183 184 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 185 ret = func(block, opaque); 186 if (ret) { 187 break; 188 } 189 } 190 return ret; 191 } 192 193 static void ramblock_recv_map_init(void) 194 { 195 RAMBlock *rb; 196 197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 198 assert(!rb->receivedmap); 199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 200 } 201 } 202 203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 204 { 205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 206 rb->receivedmap); 207 } 208 209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 210 { 211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 212 } 213 214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 215 { 216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 217 } 218 219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 220 size_t nr) 221 { 222 bitmap_set_atomic(rb->receivedmap, 223 ramblock_recv_bitmap_offset(host_addr, rb), 224 nr); 225 } 226 227 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 228 229 /* 230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 231 * 232 * Returns the number of bytes sent (>0) on success, or <0 on error. 233 */ 234 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 235 const char *block_name) 236 { 237 RAMBlock *block = qemu_ram_block_by_name(block_name); 238 unsigned long *le_bitmap, nbits; 239 uint64_t size; 240 241 if (!block) { 242 error_report("%s: invalid block name: %s", __func__, block_name); 243 return -1; 244 } 245 246 nbits = block->used_length >> TARGET_PAGE_BITS; 247 248 /* 249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 250 * machines we may need 4 more bytes for padding (see below 251 * comment). So extend it a bit beforehand. 252 */ 253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 254 255 /* 256 * Always use little endian when sending the bitmap. This is 257 * required when the source and destination VMs are not using the 258 * same endianness. (Note: big endian won't work.) 259 */ 260 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 261 262 /* Size of the bitmap, in bytes */ 263 size = DIV_ROUND_UP(nbits, 8); 264 265 /* 266 * size is always aligned to 8 bytes for 64bit machines, but it 267 * may not be true for 32bit machines. We need this padding to 268 * make sure the migration can survive even between 32bit and 269 * 64bit machines. 270 */ 271 size = ROUND_UP(size, 8); 272 273 qemu_put_be64(file, size); 274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 275 /* 276 * Mark as an end, in case the middle part is screwed up due to 277 * some "mysterious" reason.
278 */ 279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 280 qemu_fflush(file); 281 282 g_free(le_bitmap); 283 284 if (qemu_file_get_error(file)) { 285 return qemu_file_get_error(file); 286 } 287 288 return size + sizeof(size); 289 } 290 291 /* 292 * An outstanding page request, on the source, having been received 293 * and queued 294 */ 295 struct RAMSrcPageRequest { 296 RAMBlock *rb; 297 hwaddr offset; 298 hwaddr len; 299 300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 301 }; 302 303 /* State of RAM for migration */ 304 struct RAMState { 305 /* QEMUFile used for this migration */ 306 QEMUFile *f; 307 /* Last block that we have visited searching for dirty pages */ 308 RAMBlock *last_seen_block; 309 /* Last block from where we have sent data */ 310 RAMBlock *last_sent_block; 311 /* Last dirty target page we have sent */ 312 ram_addr_t last_page; 313 /* last ram version we have seen */ 314 uint32_t last_version; 315 /* We are in the first round */ 316 bool ram_bulk_stage; 317 /* The free page optimization is enabled */ 318 bool fpo_enabled; 319 /* How many times we have dirtied too many pages */ 320 int dirty_rate_high_cnt; 321 /* these variables are used for bitmap sync */ 322 /* last time we did a full bitmap_sync */ 323 int64_t time_last_bitmap_sync; 324 /* bytes transferred at start_time */ 325 uint64_t bytes_xfer_prev; 326 /* number of dirty pages since start_time */ 327 uint64_t num_dirty_pages_period; 328 /* xbzrle misses since the beginning of the period */ 329 uint64_t xbzrle_cache_miss_prev; 330 331 /* compression statistics since the beginning of the period */ 332 /* number of times no free thread was available to compress data */ 333 uint64_t compress_thread_busy_prev; 334 /* amount of bytes after compression */ 335 uint64_t compressed_size_prev; 336 /* number of compressed pages */ 337 uint64_t compress_pages_prev; 338 339 /* total handled target pages at the beginning of period */ 340 uint64_t target_page_count_prev; 341 /* total handled target pages since start */ 342 uint64_t target_page_count; 343 /* number of dirty bits in the bitmap */ 344 uint64_t migration_dirty_pages; 345 /* Protects modification of the bitmap and migration dirty pages */ 346 QemuMutex bitmap_mutex; 347 /* The RAMBlock used in the last src_page_requests */ 348 RAMBlock *last_req_rb; 349 /* Queue of outstanding page requests from the destination */ 350 QemuMutex src_page_req_mutex; 351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 352 }; 353 typedef struct RAMState RAMState; 354 355 static RAMState *ram_state; 356 357 static NotifierWithReturnList precopy_notifier_list; 358 359 void precopy_infrastructure_init(void) 360 { 361 notifier_with_return_list_init(&precopy_notifier_list); 362 } 363 364 void precopy_add_notifier(NotifierWithReturn *n) 365 { 366 notifier_with_return_list_add(&precopy_notifier_list, n); 367 } 368 369 void precopy_remove_notifier(NotifierWithReturn *n) 370 { 371 notifier_with_return_remove(n); 372 } 373 374 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 375 { 376 PrecopyNotifyData pnd; 377 pnd.reason = reason; 378 pnd.errp = errp; 379 380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 381 } 382 383 void precopy_enable_free_page_optimization(void) 384 { 385 if (!ram_state) { 386 return; 387 } 388 389 ram_state->fpo_enabled = true; 390 } 391 392 uint64_t ram_bytes_remaining(void) 393 { 394 return ram_state ?
(ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 395 0; 396 } 397 398 MigrationStats ram_counters; 399 400 /* used by the search for pages to send */ 401 struct PageSearchStatus { 402 /* Current block being searched */ 403 RAMBlock *block; 404 /* Current page to search from */ 405 unsigned long page; 406 /* Set once we wrap around */ 407 bool complete_round; 408 }; 409 typedef struct PageSearchStatus PageSearchStatus; 410 411 CompressionStats compression_counters; 412 413 struct CompressParam { 414 bool done; 415 bool quit; 416 bool zero_page; 417 QEMUFile *file; 418 QemuMutex mutex; 419 QemuCond cond; 420 RAMBlock *block; 421 ram_addr_t offset; 422 423 /* internally used fields */ 424 z_stream stream; 425 uint8_t *originbuf; 426 }; 427 typedef struct CompressParam CompressParam; 428 429 struct DecompressParam { 430 bool done; 431 bool quit; 432 QemuMutex mutex; 433 QemuCond cond; 434 void *des; 435 uint8_t *compbuf; 436 int len; 437 z_stream stream; 438 }; 439 typedef struct DecompressParam DecompressParam; 440 441 static CompressParam *comp_param; 442 static QemuThread *compress_threads; 443 /* comp_done_cond is used to wake up the migration thread when 444 * one of the compression threads has finished the compression. 445 * comp_done_lock is used to co-work with comp_done_cond. 446 */ 447 static QemuMutex comp_done_lock; 448 static QemuCond comp_done_cond; 449 /* The empty QEMUFileOps will be used by file in CompressParam */ 450 static const QEMUFileOps empty_ops = { }; 451 452 static QEMUFile *decomp_file; 453 static DecompressParam *decomp_param; 454 static QemuThread *decompress_threads; 455 static QemuMutex decomp_done_lock; 456 static QemuCond decomp_done_cond; 457 458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 459 ram_addr_t offset, uint8_t *source_buf); 460 461 static void *do_data_compress(void *opaque) 462 { 463 CompressParam *param = opaque; 464 RAMBlock *block; 465 ram_addr_t offset; 466 bool zero_page; 467 468 qemu_mutex_lock(¶m->mutex); 469 while (!param->quit) { 470 if (param->block) { 471 block = param->block; 472 offset = param->offset; 473 param->block = NULL; 474 qemu_mutex_unlock(¶m->mutex); 475 476 zero_page = do_compress_ram_page(param->file, ¶m->stream, 477 block, offset, param->originbuf); 478 479 qemu_mutex_lock(&comp_done_lock); 480 param->done = true; 481 param->zero_page = zero_page; 482 qemu_cond_signal(&comp_done_cond); 483 qemu_mutex_unlock(&comp_done_lock); 484 485 qemu_mutex_lock(¶m->mutex); 486 } else { 487 qemu_cond_wait(¶m->cond, ¶m->mutex); 488 } 489 } 490 qemu_mutex_unlock(¶m->mutex); 491 492 return NULL; 493 } 494 495 static void compress_threads_save_cleanup(void) 496 { 497 int i, thread_count; 498 499 if (!migrate_use_compression() || !comp_param) { 500 return; 501 } 502 503 thread_count = migrate_compress_threads(); 504 for (i = 0; i < thread_count; i++) { 505 /* 506 * we use it as a indicator which shows if the thread is 507 * properly init'd or not 508 */ 509 if (!comp_param[i].file) { 510 break; 511 } 512 513 qemu_mutex_lock(&comp_param[i].mutex); 514 comp_param[i].quit = true; 515 qemu_cond_signal(&comp_param[i].cond); 516 qemu_mutex_unlock(&comp_param[i].mutex); 517 518 qemu_thread_join(compress_threads + i); 519 qemu_mutex_destroy(&comp_param[i].mutex); 520 qemu_cond_destroy(&comp_param[i].cond); 521 deflateEnd(&comp_param[i].stream); 522 g_free(comp_param[i].originbuf); 523 qemu_fclose(comp_param[i].file); 524 comp_param[i].file = NULL; 525 } 526 qemu_mutex_destroy(&comp_done_lock); 527 
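/* comp_done_lock and comp_done_cond are the shared completion lock/condvar initialized in compress_threads_save_setup(); they are destroyed here only after every worker thread has been joined above. */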
qemu_cond_destroy(&comp_done_cond); 528 g_free(compress_threads); 529 g_free(comp_param); 530 compress_threads = NULL; 531 comp_param = NULL; 532 } 533 534 static int compress_threads_save_setup(void) 535 { 536 int i, thread_count; 537 538 if (!migrate_use_compression()) { 539 return 0; 540 } 541 thread_count = migrate_compress_threads(); 542 compress_threads = g_new0(QemuThread, thread_count); 543 comp_param = g_new0(CompressParam, thread_count); 544 qemu_cond_init(&comp_done_cond); 545 qemu_mutex_init(&comp_done_lock); 546 for (i = 0; i < thread_count; i++) { 547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 548 if (!comp_param[i].originbuf) { 549 goto exit; 550 } 551 552 if (deflateInit(&comp_param[i].stream, 553 migrate_compress_level()) != Z_OK) { 554 g_free(comp_param[i].originbuf); 555 goto exit; 556 } 557 558 /* comp_param[i].file is just used as a dummy buffer to save data, 559 * set its ops to empty. 560 */ 561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 562 comp_param[i].done = true; 563 comp_param[i].quit = false; 564 qemu_mutex_init(&comp_param[i].mutex); 565 qemu_cond_init(&comp_param[i].cond); 566 qemu_thread_create(compress_threads + i, "compress", 567 do_data_compress, comp_param + i, 568 QEMU_THREAD_JOINABLE); 569 } 570 return 0; 571 572 exit: 573 compress_threads_save_cleanup(); 574 return -1; 575 } 576 577 /** 578 * save_page_header: write page header to wire 579 * 580 * If this is the 1st block, it also writes the block identification 581 * 582 * Returns the number of bytes written 583 * 584 * @f: QEMUFile where to send the data 585 * @block: block that contains the page we want to send 586 * @offset: offset inside the block for the page 587 * in the lower bits, it contains flags 588 */ 589 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 590 ram_addr_t offset) 591 { 592 size_t size, len; 593 594 if (block == rs->last_sent_block) { 595 offset |= RAM_SAVE_FLAG_CONTINUE; 596 } 597 qemu_put_be64(f, offset); 598 size = 8; 599 600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 601 len = strlen(block->idstr); 602 qemu_put_byte(f, len); 603 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 604 size += 1 + len; 605 rs->last_sent_block = block; 606 } 607 return size; 608 } 609 610 /** 611 * mig_throttle_guest_down: throttle down the guest 612 * 613 * Reduce the amount of guest CPU execution to hopefully slow down memory 614 * writes. If the guest dirty memory rate is reduced below the rate at 615 * which we can transfer pages to the destination, then we should be 616 * able to complete migration. Some workloads dirty memory way too 617 * fast and will not effectively converge, even with auto-converge. 618 */ 619 static void mig_throttle_guest_down(void) 620 { 621 MigrationState *s = migrate_get_current(); 622 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 623 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 624 int pct_max = s->parameters.max_cpu_throttle; 625 626 /* We have not started throttling yet. Let's start it. */ 627 if (!cpu_throttle_active()) { 628 cpu_throttle_set(pct_initial); 629 } else { 630 /* Throttling already on, just increase the rate */ 631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment, 632 pct_max)); 633 } 634 } 635 636 /** 637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 638 * 639 * @rs: current RAM state 640 * @current_addr: address for the zero page 641 * 642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
643 * The important thing is that a stale (not-yet-0'd) page be replaced 644 * by the new data. 645 * As a bonus, if the page wasn't in the cache it gets added so that 646 * when a small write is made into the 0'd page it gets XBZRLE sent. 647 */ 648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 649 { 650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 651 return; 652 } 653 654 /* We don't care if this fails to allocate a new cache page 655 * as long as it updated an old one */ 656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 657 ram_counters.dirty_sync_count); 658 } 659 660 #define ENCODING_FLAG_XBZRLE 0x1 661 662 /** 663 * save_xbzrle_page: compress and send current page 664 * 665 * Returns: 1 means that we wrote the page 666 * 0 means that page is identical to the one already sent 667 * -1 means that xbzrle would be longer than normal 668 * 669 * @rs: current RAM state 670 * @current_data: pointer to the address of the page contents 671 * @current_addr: addr of the page 672 * @block: block that contains the page we want to send 673 * @offset: offset inside the block for the page 674 * @last_stage: if we are at the completion stage 675 */ 676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 677 ram_addr_t current_addr, RAMBlock *block, 678 ram_addr_t offset, bool last_stage) 679 { 680 int encoded_len = 0, bytes_xbzrle; 681 uint8_t *prev_cached_page; 682 683 if (!cache_is_cached(XBZRLE.cache, current_addr, 684 ram_counters.dirty_sync_count)) { 685 xbzrle_counters.cache_miss++; 686 if (!last_stage) { 687 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 688 ram_counters.dirty_sync_count) == -1) { 689 return -1; 690 } else { 691 /* update *current_data when the page has been 692 inserted into cache */ 693 *current_data = get_cached_data(XBZRLE.cache, current_addr); 694 } 695 } 696 return -1; 697 } 698 699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 700 701 /* save current buffer into memory */ 702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 703 704 /* XBZRLE encoding (if there is no overflow) */ 705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 707 TARGET_PAGE_SIZE); 708 709 /* 710 * Update the cache contents, so that it corresponds to the data 711 * sent, in all cases except where we skip the page. 712 */ 713 if (!last_stage && encoded_len != 0) { 714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 715 /* 716 * In the case where we couldn't compress, ensure that the caller 717 * sends the data from the cache, since the guest might have 718 * changed the RAM since we copied it. 
719 */ 720 *current_data = prev_cached_page; 721 } 722 723 if (encoded_len == 0) { 724 trace_save_xbzrle_page_skipping(); 725 return 0; 726 } else if (encoded_len == -1) { 727 trace_save_xbzrle_page_overflow(); 728 xbzrle_counters.overflow++; 729 return -1; 730 } 731 732 /* Send XBZRLE based compressed page */ 733 bytes_xbzrle = save_page_header(rs, rs->f, block, 734 offset | RAM_SAVE_FLAG_XBZRLE); 735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 736 qemu_put_be16(rs->f, encoded_len); 737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 738 bytes_xbzrle += encoded_len + 1 + 2; 739 xbzrle_counters.pages++; 740 xbzrle_counters.bytes += bytes_xbzrle; 741 ram_counters.transferred += bytes_xbzrle; 742 743 return 1; 744 } 745 746 /** 747 * migration_bitmap_find_dirty: find the next dirty page from start 748 * 749 * Returns the page offset within memory region of the start of a dirty page 750 * 751 * @rs: current RAM state 752 * @rb: RAMBlock where to search for dirty pages 753 * @start: page where we start the search 754 */ 755 static inline 756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 757 unsigned long start) 758 { 759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 760 unsigned long *bitmap = rb->bmap; 761 unsigned long next; 762 763 if (ramblock_is_ignored(rb)) { 764 return size; 765 } 766 767 /* 768 * When the free page optimization is enabled, we need to check the bitmap 769 * to send the non-free pages rather than all the pages in the bulk stage. 770 */ 771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) { 772 next = start + 1; 773 } else { 774 next = find_next_bit(bitmap, size, start); 775 } 776 777 return next; 778 } 779 780 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 781 RAMBlock *rb, 782 unsigned long page) 783 { 784 bool ret; 785 786 qemu_mutex_lock(&rs->bitmap_mutex); 787 788 /* 789 * Clear dirty bitmap if needed. This _must_ be called before we 790 * send any of the page in the chunk because we need to make sure 791 * we can capture further page content changes when we sync dirty 792 * log the next time. So as long as we are going to send any of 793 * the page in the chunk we clear the remote dirty bitmap for all. 794 * Clearing it earlier won't be a problem, but too late will. 795 */ 796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) { 797 uint8_t shift = rb->clear_bmap_shift; 798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift); 799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); 800 801 /* 802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 803 * can make things easier sometimes since then start address 804 * of the small chunk will always be 64 pages aligned so the 805 * bitmap will always be aligned to unsigned long. We should 806 * even be able to remove this restriction but I'm simply 807 * keeping it. 
808 */ 809 assert(shift >= 6); 810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 811 memory_region_clear_dirty_bitmap(rb->mr, start, size); 812 } 813 814 ret = test_and_clear_bit(page, rb->bmap); 815 816 if (ret) { 817 rs->migration_dirty_pages--; 818 } 819 qemu_mutex_unlock(&rs->bitmap_mutex); 820 821 return ret; 822 } 823 824 /* Called with RCU critical section */ 825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 826 { 827 rs->migration_dirty_pages += 828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length, 829 &rs->num_dirty_pages_period); 830 } 831 832 /** 833 * ram_pagesize_summary: calculate all the pagesizes of a VM 834 * 835 * Returns a summary bitmap of the page sizes of all RAMBlocks 836 * 837 * For VMs with just normal pages this is equivalent to the host page 838 * size. If it's got some huge pages then it's the OR of all the 839 * different page sizes. 840 */ 841 uint64_t ram_pagesize_summary(void) 842 { 843 RAMBlock *block; 844 uint64_t summary = 0; 845 846 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 847 summary |= block->page_size; 848 } 849 850 return summary; 851 } 852 853 uint64_t ram_get_total_transferred_pages(void) 854 { 855 return ram_counters.normal + ram_counters.duplicate + 856 compression_counters.pages + xbzrle_counters.pages; 857 } 858 859 static void migration_update_rates(RAMState *rs, int64_t end_time) 860 { 861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 862 double compressed_size; 863 864 /* calculate period counters */ 865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 866 / (end_time - rs->time_last_bitmap_sync); 867 868 if (!page_count) { 869 return; 870 } 871 872 if (migrate_use_xbzrle()) { 873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 874 rs->xbzrle_cache_miss_prev) / page_count; 875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 876 } 877 878 if (migrate_use_compression()) { 879 compression_counters.busy_rate = (double)(compression_counters.busy - 880 rs->compress_thread_busy_prev) / page_count; 881 rs->compress_thread_busy_prev = compression_counters.busy; 882 883 compressed_size = compression_counters.compressed_size - 884 rs->compressed_size_prev; 885 if (compressed_size) { 886 double uncompressed_size = (compression_counters.pages - 887 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 888 889 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 890 compression_counters.compression_rate = 891 uncompressed_size / compressed_size; 892 893 rs->compress_pages_prev = compression_counters.pages; 894 rs->compressed_size_prev = compression_counters.compressed_size; 895 } 896 } 897 } 898 899 static void migration_trigger_throttle(RAMState *rs) 900 { 901 MigrationState *s = migrate_get_current(); 902 uint64_t threshold = s->parameters.throttle_trigger_threshold; 903 904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 907 908 /* During block migration the auto-converge logic incorrectly detects 909 * that ram migration makes no progress. Avoid this by disabling the 910 * throttling logic during the bulk phase of block migration. */ 911 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 912 /* The following detection logic can be refined later. For now: 913 Check to see if the ratio between dirtied bytes and the approx. 
914 amount of bytes that just got transferred since the last time 915 we were in this routine reaches the threshold. If that happens 916 twice, start or increase throttling. */ 917 918 if ((bytes_dirty_period > bytes_dirty_threshold) && 919 (++rs->dirty_rate_high_cnt >= 2)) { 920 trace_migration_throttle(); 921 rs->dirty_rate_high_cnt = 0; 922 mig_throttle_guest_down(); 923 } 924 } 925 } 926 927 static void migration_bitmap_sync(RAMState *rs) 928 { 929 RAMBlock *block; 930 int64_t end_time; 931 932 ram_counters.dirty_sync_count++; 933 934 if (!rs->time_last_bitmap_sync) { 935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 936 } 937 938 trace_migration_bitmap_sync_start(); 939 memory_global_dirty_log_sync(); 940 941 qemu_mutex_lock(&rs->bitmap_mutex); 942 WITH_RCU_READ_LOCK_GUARD() { 943 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 944 ramblock_sync_dirty_bitmap(rs, block); 945 } 946 ram_counters.remaining = ram_bytes_remaining(); 947 } 948 qemu_mutex_unlock(&rs->bitmap_mutex); 949 950 memory_global_after_dirty_log_sync(); 951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 952 953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 954 955 /* more than 1 second = 1000 millisecons */ 956 if (end_time > rs->time_last_bitmap_sync + 1000) { 957 migration_trigger_throttle(rs); 958 959 migration_update_rates(rs, end_time); 960 961 rs->target_page_count_prev = rs->target_page_count; 962 963 /* reset period counters */ 964 rs->time_last_bitmap_sync = end_time; 965 rs->num_dirty_pages_period = 0; 966 rs->bytes_xfer_prev = ram_counters.transferred; 967 } 968 if (migrate_use_events()) { 969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 970 } 971 } 972 973 static void migration_bitmap_sync_precopy(RAMState *rs) 974 { 975 Error *local_err = NULL; 976 977 /* 978 * The current notifier usage is just an optimization to migration, so we 979 * don't stop the normal migration process in the error case. 980 */ 981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 982 error_report_err(local_err); 983 local_err = NULL; 984 } 985 986 migration_bitmap_sync(rs); 987 988 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 989 error_report_err(local_err); 990 } 991 } 992 993 /** 994 * save_zero_page_to_file: send the zero page to the file 995 * 996 * Returns the size of data written to the file, 0 means the page is not 997 * a zero page 998 * 999 * @rs: current RAM state 1000 * @file: the file where the data is saved 1001 * @block: block that contains the page we want to send 1002 * @offset: offset inside the block for the page 1003 */ 1004 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1005 RAMBlock *block, ram_addr_t offset) 1006 { 1007 uint8_t *p = block->host + offset; 1008 int len = 0; 1009 1010 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1011 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1012 qemu_put_byte(file, 0); 1013 len += 1; 1014 } 1015 return len; 1016 } 1017 1018 /** 1019 * save_zero_page: send the zero page to the stream 1020 * 1021 * Returns the number of pages written. 
1022 * 1023 * @rs: current RAM state 1024 * @block: block that contains the page we want to send 1025 * @offset: offset inside the block for the page 1026 */ 1027 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1028 { 1029 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1030 1031 if (len) { 1032 ram_counters.duplicate++; 1033 ram_counters.transferred += len; 1034 return 1; 1035 } 1036 return -1; 1037 } 1038 1039 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1040 { 1041 if (!migrate_release_ram() || !migration_in_postcopy()) { 1042 return; 1043 } 1044 1045 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1046 } 1047 1048 /* 1049 * @pages: the number of pages written by the control path, 1050 * < 0 - error 1051 * > 0 - number of pages written 1052 * 1053 * Return true if the page has been saved, otherwise false is returned. 1054 */ 1055 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1056 int *pages) 1057 { 1058 uint64_t bytes_xmit = 0; 1059 int ret; 1060 1061 *pages = -1; 1062 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1063 &bytes_xmit); 1064 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1065 return false; 1066 } 1067 1068 if (bytes_xmit) { 1069 ram_counters.transferred += bytes_xmit; 1070 *pages = 1; 1071 } 1072 1073 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1074 return true; 1075 } 1076 1077 if (bytes_xmit > 0) { 1078 ram_counters.normal++; 1079 } else if (bytes_xmit == 0) { 1080 ram_counters.duplicate++; 1081 } 1082 1083 return true; 1084 } 1085 1086 /* 1087 * directly send the page to the stream 1088 * 1089 * Returns the number of pages written. 1090 * 1091 * @rs: current RAM state 1092 * @block: block that contains the page we want to send 1093 * @offset: offset inside the block for the page 1094 * @buf: the page to be sent 1095 * @async: send the page asynchronously 1096 */ 1097 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1098 uint8_t *buf, bool async) 1099 { 1100 ram_counters.transferred += save_page_header(rs, rs->f, block, 1101 offset | RAM_SAVE_FLAG_PAGE); 1102 if (async) { 1103 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1104 migrate_release_ram() & 1105 migration_in_postcopy()); 1106 } else { 1107 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1108 } 1109 ram_counters.transferred += TARGET_PAGE_SIZE; 1110 ram_counters.normal++; 1111 return 1; 1112 } 1113 1114 /** 1115 * ram_save_page: send the given page to the stream 1116 * 1117 * Returns the number of pages written. 1118 * < 0 - error 1119 * >=0 - Number of pages written - this might legally be 0 1120 * if xbzrle noticed the page was the same.
1121 * 1122 * @rs: current RAM state 1123 * @block: block that contains the page we want to send 1124 * @offset: offset inside the block for the page 1125 * @last_stage: if we are at the completion stage 1126 */ 1127 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1128 { 1129 int pages = -1; 1130 uint8_t *p; 1131 bool send_async = true; 1132 RAMBlock *block = pss->block; 1133 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1134 ram_addr_t current_addr = block->offset + offset; 1135 1136 p = block->host + offset; 1137 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1138 1139 XBZRLE_cache_lock(); 1140 if (!rs->ram_bulk_stage && !migration_in_postcopy() && 1141 migrate_use_xbzrle()) { 1142 pages = save_xbzrle_page(rs, &p, current_addr, block, 1143 offset, last_stage); 1144 if (!last_stage) { 1145 /* Can't send this cached data async, since the cache page 1146 * might get updated before it gets to the wire 1147 */ 1148 send_async = false; 1149 } 1150 } 1151 1152 /* XBZRLE overflow or normal page */ 1153 if (pages == -1) { 1154 pages = save_normal_page(rs, block, offset, p, send_async); 1155 } 1156 1157 XBZRLE_cache_unlock(); 1158 1159 return pages; 1160 } 1161 1162 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1163 ram_addr_t offset) 1164 { 1165 if (multifd_queue_page(rs->f, block, offset) < 0) { 1166 return -1; 1167 } 1168 ram_counters.normal++; 1169 1170 return 1; 1171 } 1172 1173 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1174 ram_addr_t offset, uint8_t *source_buf) 1175 { 1176 RAMState *rs = ram_state; 1177 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1178 bool zero_page = false; 1179 int ret; 1180 1181 if (save_zero_page_to_file(rs, f, block, offset)) { 1182 zero_page = true; 1183 goto exit; 1184 } 1185 1186 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1187 1188 /* 1189 * copy it to a internal buffer to avoid it being modified by VM 1190 * so that we can catch up the error during compression and 1191 * decompression 1192 */ 1193 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1194 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1195 if (ret < 0) { 1196 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1197 error_report("compressed data failed!"); 1198 return false; 1199 } 1200 1201 exit: 1202 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1203 return zero_page; 1204 } 1205 1206 static void 1207 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1208 { 1209 ram_counters.transferred += bytes_xmit; 1210 1211 if (param->zero_page) { 1212 ram_counters.duplicate++; 1213 return; 1214 } 1215 1216 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. 
*/ 1217 compression_counters.compressed_size += bytes_xmit - 8; 1218 compression_counters.pages++; 1219 } 1220 1221 static bool save_page_use_compression(RAMState *rs); 1222 1223 static void flush_compressed_data(RAMState *rs) 1224 { 1225 int idx, len, thread_count; 1226 1227 if (!save_page_use_compression(rs)) { 1228 return; 1229 } 1230 thread_count = migrate_compress_threads(); 1231 1232 qemu_mutex_lock(&comp_done_lock); 1233 for (idx = 0; idx < thread_count; idx++) { 1234 while (!comp_param[idx].done) { 1235 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1236 } 1237 } 1238 qemu_mutex_unlock(&comp_done_lock); 1239 1240 for (idx = 0; idx < thread_count; idx++) { 1241 qemu_mutex_lock(&comp_param[idx].mutex); 1242 if (!comp_param[idx].quit) { 1243 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1244 /* 1245 * it's safe to fetch zero_page without holding comp_done_lock 1246 * as there is no further request submitted to the thread, 1247 * i.e, the thread should be waiting for a request at this point. 1248 */ 1249 update_compress_thread_counts(&comp_param[idx], len); 1250 } 1251 qemu_mutex_unlock(&comp_param[idx].mutex); 1252 } 1253 } 1254 1255 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1256 ram_addr_t offset) 1257 { 1258 param->block = block; 1259 param->offset = offset; 1260 } 1261 1262 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1263 ram_addr_t offset) 1264 { 1265 int idx, thread_count, bytes_xmit = -1, pages = -1; 1266 bool wait = migrate_compress_wait_thread(); 1267 1268 thread_count = migrate_compress_threads(); 1269 qemu_mutex_lock(&comp_done_lock); 1270 retry: 1271 for (idx = 0; idx < thread_count; idx++) { 1272 if (comp_param[idx].done) { 1273 comp_param[idx].done = false; 1274 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1275 qemu_mutex_lock(&comp_param[idx].mutex); 1276 set_compress_params(&comp_param[idx], block, offset); 1277 qemu_cond_signal(&comp_param[idx].cond); 1278 qemu_mutex_unlock(&comp_param[idx].mutex); 1279 pages = 1; 1280 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1281 break; 1282 } 1283 } 1284 1285 /* 1286 * wait for the free thread if the user specifies 'compress-wait-thread', 1287 * otherwise we will post the page out in the main thread as normal page. 1288 */ 1289 if (pages < 0 && wait) { 1290 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1291 goto retry; 1292 } 1293 qemu_mutex_unlock(&comp_done_lock); 1294 1295 return pages; 1296 } 1297 1298 /** 1299 * find_dirty_block: find the next dirty page and update any state 1300 * associated with the search process. 1301 * 1302 * Returns true if a page is found 1303 * 1304 * @rs: current RAM state 1305 * @pss: data about the state of the current dirty page scan 1306 * @again: set to false if the search has scanned the whole of RAM 1307 */ 1308 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1309 { 1310 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1311 if (pss->complete_round && pss->block == rs->last_seen_block && 1312 pss->page >= rs->last_page) { 1313 /* 1314 * We've been once around the RAM and haven't found anything. 1315 * Give up. 
1316 */ 1317 *again = false; 1318 return false; 1319 } 1320 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS) 1321 >= pss->block->used_length) { 1322 /* Didn't find anything in this RAM Block */ 1323 pss->page = 0; 1324 pss->block = QLIST_NEXT_RCU(pss->block, next); 1325 if (!pss->block) { 1326 /* 1327 * If memory migration starts over, we will meet a dirtied page 1328 * which may still exist in the compression threads' ring, so we 1329 * should flush the compressed data to make sure the new page 1330 * is not overwritten by the old one in the destination. 1331 * 1332 * Also, if xbzrle is on, stop using the data compression at this 1333 * point. In theory, xbzrle can do better than compression. 1334 */ 1335 flush_compressed_data(rs); 1336 1337 /* Hit the end of the list */ 1338 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1339 /* Flag that we've looped */ 1340 pss->complete_round = true; 1341 rs->ram_bulk_stage = false; 1342 } 1343 /* Didn't find anything this time, but try again on the new block */ 1344 *again = true; 1345 return false; 1346 } else { 1347 /* Can go around again, but... */ 1348 *again = true; 1349 /* We've found something so probably don't need to */ 1350 return true; 1351 } 1352 } 1353 1354 /** 1355 * unqueue_page: gets a page off the queue 1356 * 1357 * Helper for 'get_queued_page' - gets a page off the queue 1358 * 1359 * Returns the block of the page (or NULL if none available) 1360 * 1361 * @rs: current RAM state 1362 * @offset: used to return the offset within the RAMBlock 1363 */ 1364 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1365 { 1366 RAMBlock *block = NULL; 1367 1368 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1369 return NULL; 1370 } 1371 1372 qemu_mutex_lock(&rs->src_page_req_mutex); 1373 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1374 struct RAMSrcPageRequest *entry = 1375 QSIMPLEQ_FIRST(&rs->src_page_requests); 1376 block = entry->rb; 1377 *offset = entry->offset; 1378 1379 if (entry->len > TARGET_PAGE_SIZE) { 1380 entry->len -= TARGET_PAGE_SIZE; 1381 entry->offset += TARGET_PAGE_SIZE; 1382 } else { 1383 memory_region_unref(block->mr); 1384 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1385 g_free(entry); 1386 migration_consume_urgent_request(); 1387 } 1388 } 1389 qemu_mutex_unlock(&rs->src_page_req_mutex); 1390 1391 return block; 1392 } 1393 1394 /** 1395 * get_queued_page: unqueue a page from the postcopy requests 1396 * 1397 * Skips pages that are already sent (!dirty) 1398 * 1399 * Returns true if a queued page is found 1400 * 1401 * @rs: current RAM state 1402 * @pss: data about the state of the current dirty page scan 1403 */ 1404 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1405 { 1406 RAMBlock *block; 1407 ram_addr_t offset; 1408 bool dirty; 1409 1410 do { 1411 block = unqueue_page(rs, &offset); 1412 /* 1413 * We're sending this page, and since it's postcopy nothing else 1414 * will dirty it, and we must make sure it doesn't get sent again 1415 * even if this queue request was received after the background 1416 * search already sent it.
1417 */ 1418 if (block) { 1419 unsigned long page; 1420 1421 page = offset >> TARGET_PAGE_BITS; 1422 dirty = test_bit(page, block->bmap); 1423 if (!dirty) { 1424 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1425 page); 1426 } else { 1427 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1428 } 1429 } 1430 1431 } while (block && !dirty); 1432 1433 if (block) { 1434 /* 1435 * As soon as we start servicing pages out of order, we have 1436 * to kill the bulk stage, since the bulk stage assumes 1437 * in (migration_bitmap_find_and_reset_dirty) that every page is 1438 * dirty, and that's no longer true. 1439 */ 1440 rs->ram_bulk_stage = false; 1441 1442 /* 1443 * We want the background search to continue from the queued page 1444 * since the guest is likely to want other pages near to the page 1445 * it just requested. 1446 */ 1447 pss->block = block; 1448 pss->page = offset >> TARGET_PAGE_BITS; 1449 1450 /* 1451 * This unqueued page would break the "one round" check, even if it 1452 * is really rare. 1453 */ 1454 pss->complete_round = false; 1455 } 1456 1457 return !!block; 1458 } 1459 1460 /** 1461 * migration_page_queue_free: drop any remaining pages in the ram 1462 * request queue 1463 * 1464 * It should be empty at the end anyway, but in error cases there may 1465 * be some left. In case any page is left, we drop it. 1466 * 1467 */ 1468 static void migration_page_queue_free(RAMState *rs) 1469 { 1470 struct RAMSrcPageRequest *mspr, *next_mspr; 1471 /* This queue generally should be empty - but in the case of a failed 1472 * migration might have some droppings in. 1473 */ 1474 RCU_READ_LOCK_GUARD(); 1475 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1476 memory_region_unref(mspr->rb->mr); 1477 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1478 g_free(mspr); 1479 } 1480 } 1481 1482 /** 1483 * ram_save_queue_pages: queue the page for transmission 1484 * 1485 * A request from postcopy destination for example. 1486 * 1487 * Returns zero on success or negative on error 1488 * 1489 * @rbname: Name of the RAMBlock of the request. NULL means the 1490 * same as the last one. 1491 * @start: starting address from the start of the RAMBlock 1492 * @len: length (in bytes) to send 1493 */ 1494 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1495 { 1496 RAMBlock *ramblock; 1497 RAMState *rs = ram_state; 1498 1499 ram_counters.postcopy_requests++; 1500 RCU_READ_LOCK_GUARD(); 1501 1502 if (!rbname) { 1503 /* Reuse last RAMBlock */ 1504 ramblock = rs->last_req_rb; 1505 1506 if (!ramblock) { 1507 /* 1508 * Shouldn't happen, we can't reuse the last RAMBlock if 1509 * it's the 1st request.
1510 */ 1511 error_report("ram_save_queue_pages no previous block"); 1512 return -1; 1513 } 1514 } else { 1515 ramblock = qemu_ram_block_by_name(rbname); 1516 1517 if (!ramblock) { 1518 /* We shouldn't be asked for a non-existent RAMBlock */ 1519 error_report("ram_save_queue_pages no block '%s'", rbname); 1520 return -1; 1521 } 1522 rs->last_req_rb = ramblock; 1523 } 1524 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1525 if (start + len > ramblock->used_length) { 1526 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1527 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1528 __func__, start, len, ramblock->used_length); 1529 return -1; 1530 } 1531 1532 struct RAMSrcPageRequest *new_entry = 1533 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1534 new_entry->rb = ramblock; 1535 new_entry->offset = start; 1536 new_entry->len = len; 1537 1538 memory_region_ref(ramblock->mr); 1539 qemu_mutex_lock(&rs->src_page_req_mutex); 1540 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1541 migration_make_urgent_request(); 1542 qemu_mutex_unlock(&rs->src_page_req_mutex); 1543 1544 return 0; 1545 } 1546 1547 static bool save_page_use_compression(RAMState *rs) 1548 { 1549 if (!migrate_use_compression()) { 1550 return false; 1551 } 1552 1553 /* 1554 * If xbzrle is on, stop using the data compression after the first 1555 * round of migration even if compression is enabled. In theory, 1556 * xbzrle can do better than compression. 1557 */ 1558 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 1559 return true; 1560 } 1561 1562 return false; 1563 } 1564 1565 /* 1566 * try to compress the page before posting it out, return true if the page 1567 * has been properly handled by compression, otherwise it needs other 1568 * paths to handle it 1569 */ 1570 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1571 { 1572 if (!save_page_use_compression(rs)) { 1573 return false; 1574 } 1575 1576 /* 1577 * When starting the process of a new block, the first page of 1578 * the block should be sent out before other pages in the same 1579 * block, and all the pages in the last block should have been sent 1580 * out. Keeping this order is important, because the 'cont' flag 1581 * is used to avoid resending the block name. 1582 * 1583 * We post the first page as a normal page because compression will take 1584 * much CPU resource.
1585 */ 1586 if (block != rs->last_sent_block) { 1587 flush_compressed_data(rs); 1588 return false; 1589 } 1590 1591 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1592 return true; 1593 } 1594 1595 compression_counters.busy++; 1596 return false; 1597 } 1598 1599 /** 1600 * ram_save_target_page: save one target page 1601 * 1602 * Returns the number of pages written 1603 * 1604 * @rs: current RAM state 1605 * @pss: data about the page we want to send 1606 * @last_stage: if we are at the completion stage 1607 */ 1608 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1609 bool last_stage) 1610 { 1611 RAMBlock *block = pss->block; 1612 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1613 int res; 1614 1615 if (control_save_page(rs, block, offset, &res)) { 1616 return res; 1617 } 1618 1619 if (save_compress_page(rs, block, offset)) { 1620 return 1; 1621 } 1622 1623 res = save_zero_page(rs, block, offset); 1624 if (res > 0) { 1625 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1626 * page would be stale 1627 */ 1628 if (!save_page_use_compression(rs)) { 1629 XBZRLE_cache_lock(); 1630 xbzrle_cache_zero_page(rs, block->offset + offset); 1631 XBZRLE_cache_unlock(); 1632 } 1633 ram_release_pages(block->idstr, offset, res); 1634 return res; 1635 } 1636 1637 /* 1638 * Do not use multifd for: 1639 * 1. Compression as the first page in the new block should be posted out 1640 * before sending the compressed page 1641 * 2. In postcopy as one whole host page should be placed 1642 */ 1643 if (!save_page_use_compression(rs) && migrate_use_multifd() 1644 && !migration_in_postcopy()) { 1645 return ram_save_multifd_page(rs, block, offset); 1646 } 1647 1648 return ram_save_page(rs, pss, last_stage); 1649 } 1650 1651 /** 1652 * ram_save_host_page: save a whole host page 1653 * 1654 * Starting at *offset send pages up to the end of the current host 1655 * page. It's valid for the initial offset to point into the middle of 1656 * a host page in which case the remainder of the hostpage is sent. 1657 * Only dirty target pages are sent. Note that the host page size may 1658 * be a huge page for this block. 1659 * The saving stops at the boundary of the used_length of the block 1660 * if the RAMBlock isn't a multiple of the host page size. 
1661 * 1662 * Returns the number of pages written or negative on error 1663 * 1664 * @rs: current RAM state 1665 * @ms: current migration state 1666 * @pss: data about the page we want to send 1667 * @last_stage: if we are at the completion stage 1668 */ 1669 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 1670 bool last_stage) 1671 { 1672 int tmppages, pages = 0; 1673 size_t pagesize_bits = 1674 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 1675 1676 if (ramblock_is_ignored(pss->block)) { 1677 error_report("block %s should not be migrated !", pss->block->idstr); 1678 return 0; 1679 } 1680 1681 do { 1682 /* Check the pages is dirty and if it is send it */ 1683 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 1684 pss->page++; 1685 continue; 1686 } 1687 1688 tmppages = ram_save_target_page(rs, pss, last_stage); 1689 if (tmppages < 0) { 1690 return tmppages; 1691 } 1692 1693 pages += tmppages; 1694 pss->page++; 1695 /* Allow rate limiting to happen in the middle of huge pages */ 1696 migration_rate_limit(); 1697 } while ((pss->page & (pagesize_bits - 1)) && 1698 offset_in_ramblock(pss->block, 1699 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 1700 1701 /* The offset we leave with is the last one we looked at */ 1702 pss->page--; 1703 return pages; 1704 } 1705 1706 /** 1707 * ram_find_and_save_block: finds a dirty page and sends it to f 1708 * 1709 * Called within an RCU critical section. 1710 * 1711 * Returns the number of pages written where zero means no dirty pages, 1712 * or negative on error 1713 * 1714 * @rs: current RAM state 1715 * @last_stage: if we are at the completion stage 1716 * 1717 * On systems where host-page-size > target-page-size it will send all the 1718 * pages in a host page that are dirty. 
1719 */ 1720 1721 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1722 { 1723 PageSearchStatus pss; 1724 int pages = 0; 1725 bool again, found; 1726 1727 /* No dirty page as there is zero RAM */ 1728 if (!ram_bytes_total()) { 1729 return pages; 1730 } 1731 1732 pss.block = rs->last_seen_block; 1733 pss.page = rs->last_page; 1734 pss.complete_round = false; 1735 1736 if (!pss.block) { 1737 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1738 } 1739 1740 do { 1741 again = true; 1742 found = get_queued_page(rs, &pss); 1743 1744 if (!found) { 1745 /* priority queue empty, so just search for something dirty */ 1746 found = find_dirty_block(rs, &pss, &again); 1747 } 1748 1749 if (found) { 1750 pages = ram_save_host_page(rs, &pss, last_stage); 1751 } 1752 } while (!pages && again); 1753 1754 rs->last_seen_block = pss.block; 1755 rs->last_page = pss.page; 1756 1757 return pages; 1758 } 1759 1760 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1761 { 1762 uint64_t pages = size / TARGET_PAGE_SIZE; 1763 1764 if (zero) { 1765 ram_counters.duplicate += pages; 1766 } else { 1767 ram_counters.normal += pages; 1768 ram_counters.transferred += size; 1769 qemu_update_position(f, size); 1770 } 1771 } 1772 1773 static uint64_t ram_bytes_total_common(bool count_ignored) 1774 { 1775 RAMBlock *block; 1776 uint64_t total = 0; 1777 1778 RCU_READ_LOCK_GUARD(); 1779 1780 if (count_ignored) { 1781 RAMBLOCK_FOREACH_MIGRATABLE(block) { 1782 total += block->used_length; 1783 } 1784 } else { 1785 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1786 total += block->used_length; 1787 } 1788 } 1789 return total; 1790 } 1791 1792 uint64_t ram_bytes_total(void) 1793 { 1794 return ram_bytes_total_common(false); 1795 } 1796 1797 static void xbzrle_load_setup(void) 1798 { 1799 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 1800 } 1801 1802 static void xbzrle_load_cleanup(void) 1803 { 1804 g_free(XBZRLE.decoded_buf); 1805 XBZRLE.decoded_buf = NULL; 1806 } 1807 1808 static void ram_state_cleanup(RAMState **rsp) 1809 { 1810 if (*rsp) { 1811 migration_page_queue_free(*rsp); 1812 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 1813 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 1814 g_free(*rsp); 1815 *rsp = NULL; 1816 } 1817 } 1818 1819 static void xbzrle_cleanup(void) 1820 { 1821 XBZRLE_cache_lock(); 1822 if (XBZRLE.cache) { 1823 cache_fini(XBZRLE.cache); 1824 g_free(XBZRLE.encoded_buf); 1825 g_free(XBZRLE.current_buf); 1826 g_free(XBZRLE.zero_target_page); 1827 XBZRLE.cache = NULL; 1828 XBZRLE.encoded_buf = NULL; 1829 XBZRLE.current_buf = NULL; 1830 XBZRLE.zero_target_page = NULL; 1831 } 1832 XBZRLE_cache_unlock(); 1833 } 1834 1835 static void ram_save_cleanup(void *opaque) 1836 { 1837 RAMState **rsp = opaque; 1838 RAMBlock *block; 1839 1840 /* caller have hold iothread lock or is in a bh, so there is 1841 * no writing race against the migration bitmap 1842 */ 1843 memory_global_dirty_log_stop(); 1844 1845 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1846 g_free(block->clear_bmap); 1847 block->clear_bmap = NULL; 1848 g_free(block->bmap); 1849 block->bmap = NULL; 1850 } 1851 1852 xbzrle_cleanup(); 1853 compress_threads_save_cleanup(); 1854 ram_state_cleanup(rsp); 1855 } 1856 1857 static void ram_state_reset(RAMState *rs) 1858 { 1859 rs->last_seen_block = NULL; 1860 rs->last_sent_block = NULL; 1861 rs->last_page = 0; 1862 rs->last_version = ram_list.version; 1863 rs->ram_bulk_stage = true; 1864 rs->fpo_enabled = false; 1865 } 1866 1867 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1868 1869 /* 1870 * 
'expected' is the value you expect the bitmap mostly to be full 1871 * of; it won't bother printing lines that are all this value. 1872 * If 'todump' is null the migration bitmap is dumped. 1873 */ 1874 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 1875 unsigned long pages) 1876 { 1877 int64_t cur; 1878 int64_t linelen = 128; 1879 char linebuf[129]; 1880 1881 for (cur = 0; cur < pages; cur += linelen) { 1882 int64_t curb; 1883 bool found = false; 1884 /* 1885 * Last line; catch the case where the line length 1886 * is longer than remaining ram 1887 */ 1888 if (cur + linelen > pages) { 1889 linelen = pages - cur; 1890 } 1891 for (curb = 0; curb < linelen; curb++) { 1892 bool thisbit = test_bit(cur + curb, todump); 1893 linebuf[curb] = thisbit ? '1' : '.'; 1894 found = found || (thisbit != expected); 1895 } 1896 if (found) { 1897 linebuf[curb] = '\0'; 1898 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1899 } 1900 } 1901 } 1902 1903 /* **** functions for postcopy ***** */ 1904 1905 void ram_postcopy_migrated_memory_release(MigrationState *ms) 1906 { 1907 struct RAMBlock *block; 1908 1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1910 unsigned long *bitmap = block->bmap; 1911 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 1912 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 1913 1914 while (run_start < range) { 1915 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 1916 ram_discard_range(block->idstr, 1917 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 1918 ((ram_addr_t)(run_end - run_start)) 1919 << TARGET_PAGE_BITS); 1920 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 1921 } 1922 } 1923 } 1924 1925 /** 1926 * postcopy_send_discard_bm_ram: discard a RAMBlock 1927 * 1928 * Returns zero on success 1929 * 1930 * Callback from postcopy_each_ram_send_discard for each RAMBlock 1931 * 1932 * @ms: current migration state 1933 * @block: RAMBlock to discard 1934 */ 1935 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 1936 { 1937 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 1938 unsigned long current; 1939 unsigned long *bitmap = block->bmap; 1940 1941 for (current = 0; current < end; ) { 1942 unsigned long one = find_next_bit(bitmap, end, current); 1943 unsigned long zero, discard_length; 1944 1945 if (one >= end) { 1946 break; 1947 } 1948 1949 zero = find_next_zero_bit(bitmap, end, one + 1); 1950 1951 if (zero >= end) { 1952 discard_length = end - one; 1953 } else { 1954 discard_length = zero - one; 1955 } 1956 postcopy_discard_send_range(ms, one, discard_length); 1957 current = one + discard_length; 1958 } 1959 1960 return 0; 1961 } 1962 1963 /** 1964 * postcopy_each_ram_send_discard: discard all RAMBlocks 1965 * 1966 * Returns 0 for success or negative for error 1967 * 1968 * Utility for the outgoing postcopy code. 1969 * Calls postcopy_send_discard_bm_ram for each RAMBlock 1970 * passing it bitmap indexes and name. 
1971 * (qemu_ram_foreach_block ends up passing unscaled lengths 1972 * which would mean postcopy code would have to deal with target page) 1973 * 1974 * @ms: current migration state 1975 */ 1976 static int postcopy_each_ram_send_discard(MigrationState *ms) 1977 { 1978 struct RAMBlock *block; 1979 int ret; 1980 1981 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1982 postcopy_discard_send_init(ms, block->idstr); 1983 1984 /* 1985 * Postcopy sends chunks of bitmap over the wire, but it 1986 * just needs indexes at this point, avoids it having 1987 * target page specific code. 1988 */ 1989 ret = postcopy_send_discard_bm_ram(ms, block); 1990 postcopy_discard_send_finish(ms); 1991 if (ret) { 1992 return ret; 1993 } 1994 } 1995 1996 return 0; 1997 } 1998 1999 /** 2000 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2001 * 2002 * Helper for postcopy_chunk_hostpages; it's called twice to 2003 * canonicalize the two bitmaps, that are similar, but one is 2004 * inverted. 2005 * 2006 * Postcopy requires that all target pages in a hostpage are dirty or 2007 * clean, not a mix. This function canonicalizes the bitmaps. 2008 * 2009 * @ms: current migration state 2010 * @block: block that contains the page we want to canonicalize 2011 */ 2012 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2013 { 2014 RAMState *rs = ram_state; 2015 unsigned long *bitmap = block->bmap; 2016 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2017 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2018 unsigned long run_start; 2019 2020 if (block->page_size == TARGET_PAGE_SIZE) { 2021 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2022 return; 2023 } 2024 2025 /* Find a dirty page */ 2026 run_start = find_next_bit(bitmap, pages, 0); 2027 2028 while (run_start < pages) { 2029 2030 /* 2031 * If the start of this run of pages is in the middle of a host 2032 * page, then we need to fixup this host page. 2033 */ 2034 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2035 /* Find the end of this run */ 2036 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2037 /* 2038 * If the end isn't at the start of a host page, then the 2039 * run doesn't finish at the end of a host page 2040 * and we need to discard. 2041 */ 2042 } 2043 2044 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2045 unsigned long page; 2046 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2047 host_ratio); 2048 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2049 2050 /* Clean up the bitmap */ 2051 for (page = fixup_start_addr; 2052 page < fixup_start_addr + host_ratio; page++) { 2053 /* 2054 * Remark them as dirty, updating the count for any pages 2055 * that weren't previously dirty. 2056 */ 2057 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2058 } 2059 } 2060 2061 /* Find the next dirty page for the next iteration */ 2062 run_start = find_next_bit(bitmap, pages, run_start); 2063 } 2064 } 2065 2066 /** 2067 * postcopy_chunk_hostpages: discard any partially sent host page 2068 * 2069 * Utility for the outgoing postcopy code. 2070 * 2071 * Discard any partially sent host-page size chunks, mark any partially 2072 * dirty host-page size chunks as all dirty. In this case the host-page 2073 * is the host-page for the particular RAMBlock, i.e. 
it might be a huge page 2074 * 2075 * Returns zero on success 2076 * 2077 * @ms: current migration state 2078 * @block: block we want to work with 2079 */ 2080 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2081 { 2082 postcopy_discard_send_init(ms, block->idstr); 2083 2084 /* 2085 * Ensure that all partially dirty host pages are made fully dirty. 2086 */ 2087 postcopy_chunk_hostpages_pass(ms, block); 2088 2089 postcopy_discard_send_finish(ms); 2090 return 0; 2091 } 2092 2093 /** 2094 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2095 * 2096 * Returns zero on success 2097 * 2098 * Transmit the set of pages to be discarded after precopy to the target 2099 * these are pages that: 2100 * a) Have been previously transmitted but are now dirty again 2101 * b) Pages that have never been transmitted, this ensures that 2102 * any pages on the destination that have been mapped by background 2103 * tasks get discarded (transparent huge pages is the specific concern) 2104 * Hopefully this is pretty sparse 2105 * 2106 * @ms: current migration state 2107 */ 2108 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2109 { 2110 RAMState *rs = ram_state; 2111 RAMBlock *block; 2112 int ret; 2113 2114 RCU_READ_LOCK_GUARD(); 2115 2116 /* This should be our last sync, the src is now paused */ 2117 migration_bitmap_sync(rs); 2118 2119 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2120 rs->last_seen_block = NULL; 2121 rs->last_sent_block = NULL; 2122 rs->last_page = 0; 2123 2124 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2125 /* Deal with TPS != HPS and huge pages */ 2126 ret = postcopy_chunk_hostpages(ms, block); 2127 if (ret) { 2128 return ret; 2129 } 2130 2131 #ifdef DEBUG_POSTCOPY 2132 ram_debug_dump_bitmap(block->bmap, true, 2133 block->used_length >> TARGET_PAGE_BITS); 2134 #endif 2135 } 2136 trace_ram_postcopy_send_discard_bitmap(); 2137 2138 return postcopy_each_ram_send_discard(ms); 2139 } 2140 2141 /** 2142 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2143 * 2144 * Returns zero on success 2145 * 2146 * @rbname: name of the RAMBlock of the request. NULL means the 2147 * same that last one. 2148 * @start: RAMBlock starting page 2149 * @length: RAMBlock size 2150 */ 2151 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2152 { 2153 trace_ram_discard_range(rbname, start, length); 2154 2155 RCU_READ_LOCK_GUARD(); 2156 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2157 2158 if (!rb) { 2159 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2160 return -1; 2161 } 2162 2163 /* 2164 * On source VM, we don't need to update the received bitmap since 2165 * we don't even have one. 2166 */ 2167 if (rb->receivedmap) { 2168 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2169 length >> qemu_target_page_bits()); 2170 } 2171 2172 return ram_block_discard_range(rb, start, length); 2173 } 2174 2175 /* 2176 * For every allocation, we will try not to crash the VM if the 2177 * allocation failed. 
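 * On failure, xbzrle_init() below tears down whatever was already
 * allocated, in reverse order via the goto labels, and returns -ENOMEM.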
2178 */ 2179 static int xbzrle_init(void) 2180 { 2181 Error *local_err = NULL; 2182 2183 if (!migrate_use_xbzrle()) { 2184 return 0; 2185 } 2186 2187 XBZRLE_cache_lock(); 2188 2189 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2190 if (!XBZRLE.zero_target_page) { 2191 error_report("%s: Error allocating zero page", __func__); 2192 goto err_out; 2193 } 2194 2195 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2196 TARGET_PAGE_SIZE, &local_err); 2197 if (!XBZRLE.cache) { 2198 error_report_err(local_err); 2199 goto free_zero_page; 2200 } 2201 2202 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2203 if (!XBZRLE.encoded_buf) { 2204 error_report("%s: Error allocating encoded_buf", __func__); 2205 goto free_cache; 2206 } 2207 2208 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2209 if (!XBZRLE.current_buf) { 2210 error_report("%s: Error allocating current_buf", __func__); 2211 goto free_encoded_buf; 2212 } 2213 2214 /* We are all good */ 2215 XBZRLE_cache_unlock(); 2216 return 0; 2217 2218 free_encoded_buf: 2219 g_free(XBZRLE.encoded_buf); 2220 XBZRLE.encoded_buf = NULL; 2221 free_cache: 2222 cache_fini(XBZRLE.cache); 2223 XBZRLE.cache = NULL; 2224 free_zero_page: 2225 g_free(XBZRLE.zero_target_page); 2226 XBZRLE.zero_target_page = NULL; 2227 err_out: 2228 XBZRLE_cache_unlock(); 2229 return -ENOMEM; 2230 } 2231 2232 static int ram_state_init(RAMState **rsp) 2233 { 2234 *rsp = g_try_new0(RAMState, 1); 2235 2236 if (!*rsp) { 2237 error_report("%s: Init ramstate fail", __func__); 2238 return -1; 2239 } 2240 2241 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2242 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2243 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2244 2245 /* 2246 * Count the total number of pages used by ram blocks not including any 2247 * gaps due to alignment or unplugs. 2248 * This must match with the initial values of dirty bitmap. 2249 */ 2250 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2251 ram_state_reset(*rsp); 2252 2253 return 0; 2254 } 2255 2256 static void ram_list_init_bitmaps(void) 2257 { 2258 MigrationState *ms = migrate_get_current(); 2259 RAMBlock *block; 2260 unsigned long pages; 2261 uint8_t shift; 2262 2263 /* Skip setting bitmap if there is no RAM */ 2264 if (ram_bytes_total()) { 2265 shift = ms->clear_bitmap_shift; 2266 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2267 error_report("clear_bitmap_shift (%u) too big, using " 2268 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2269 shift = CLEAR_BITMAP_SHIFT_MAX; 2270 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2271 error_report("clear_bitmap_shift (%u) too small, using " 2272 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2273 shift = CLEAR_BITMAP_SHIFT_MIN; 2274 } 2275 2276 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2277 pages = block->max_length >> TARGET_PAGE_BITS; 2278 /* 2279 * The initial dirty bitmap for migration must be set with all 2280 * ones to make sure we'll migrate every guest RAM page to 2281 * destination. 2282 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2283 * new migration after a failed migration, ram_list. 2284 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2285 * guest memory. 2286 */ 2287 block->bmap = bitmap_new(pages); 2288 bitmap_set(block->bmap, 0, pages); 2289 block->clear_bmap_shift = shift; 2290 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2291 } 2292 } 2293 } 2294 2295 static void ram_init_bitmaps(RAMState *rs) 2296 { 2297 /* For memory_global_dirty_log_start below. 
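       Lock order here: BQL (iothread lock), then the ram_list lock; the
       per-block bitmap allocation and the initial bitmap sync run under the
       RCU read guard.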
*/ 2298 qemu_mutex_lock_iothread(); 2299 qemu_mutex_lock_ramlist(); 2300 2301 WITH_RCU_READ_LOCK_GUARD() { 2302 ram_list_init_bitmaps(); 2303 memory_global_dirty_log_start(); 2304 migration_bitmap_sync_precopy(rs); 2305 } 2306 qemu_mutex_unlock_ramlist(); 2307 qemu_mutex_unlock_iothread(); 2308 } 2309 2310 static int ram_init_all(RAMState **rsp) 2311 { 2312 if (ram_state_init(rsp)) { 2313 return -1; 2314 } 2315 2316 if (xbzrle_init()) { 2317 ram_state_cleanup(rsp); 2318 return -1; 2319 } 2320 2321 ram_init_bitmaps(*rsp); 2322 2323 return 0; 2324 } 2325 2326 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2327 { 2328 RAMBlock *block; 2329 uint64_t pages = 0; 2330 2331 /* 2332 * Postcopy is not using xbzrle/compression, so no need for that. 2333 * Also, since source are already halted, we don't need to care 2334 * about dirty page logging as well. 2335 */ 2336 2337 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2338 pages += bitmap_count_one(block->bmap, 2339 block->used_length >> TARGET_PAGE_BITS); 2340 } 2341 2342 /* This may not be aligned with current bitmaps. Recalculate. */ 2343 rs->migration_dirty_pages = pages; 2344 2345 rs->last_seen_block = NULL; 2346 rs->last_sent_block = NULL; 2347 rs->last_page = 0; 2348 rs->last_version = ram_list.version; 2349 /* 2350 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2351 * matter what we have sent. 2352 */ 2353 rs->ram_bulk_stage = false; 2354 2355 /* Update RAMState cache of output QEMUFile */ 2356 rs->f = out; 2357 2358 trace_ram_state_resume_prepare(pages); 2359 } 2360 2361 /* 2362 * This function clears bits of the free pages reported by the caller from the 2363 * migration dirty bitmap. @addr is the host address corresponding to the 2364 * start of the continuous guest free pages, and @len is the total bytes of 2365 * those pages. 2366 */ 2367 void qemu_guest_free_page_hint(void *addr, size_t len) 2368 { 2369 RAMBlock *block; 2370 ram_addr_t offset; 2371 size_t used_len, start, npages; 2372 MigrationState *s = migrate_get_current(); 2373 2374 /* This function is currently expected to be used during live migration */ 2375 if (!migration_is_setup_or_active(s->state)) { 2376 return; 2377 } 2378 2379 for (; len > 0; len -= used_len, addr += used_len) { 2380 block = qemu_ram_block_from_host(addr, false, &offset); 2381 if (unlikely(!block || offset >= block->used_length)) { 2382 /* 2383 * The implementation might not support RAMBlock resize during 2384 * live migration, but it could happen in theory with future 2385 * updates. So we add a check here to capture that case. 2386 */ 2387 error_report_once("%s unexpected error", __func__); 2388 return; 2389 } 2390 2391 if (len <= block->used_length - offset) { 2392 used_len = len; 2393 } else { 2394 used_len = block->used_length - offset; 2395 } 2396 2397 start = offset >> TARGET_PAGE_BITS; 2398 npages = used_len >> TARGET_PAGE_BITS; 2399 2400 qemu_mutex_lock(&ram_state->bitmap_mutex); 2401 ram_state->migration_dirty_pages -= 2402 bitmap_count_one_with_offset(block->bmap, start, npages); 2403 bitmap_clear(block->bmap, start, npages); 2404 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2405 } 2406 } 2407 2408 /* 2409 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2410 * long-running RCU critical section. When rcu-reclaims in the code 2411 * start to become numerous it will be necessary to reduce the 2412 * granularity of these critical sections. 
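 *
 * For reference, the stream that ram_save_setup() below produces is,
 * roughly:
 *   be64  total RAM size | RAM_SAVE_FLAG_MEM_SIZE
 *   then, for each migratable RAMBlock:
 *     u8    strlen(idstr)
 *     bytes idstr
 *     be64  used_length
 *     be64  page_size   (only when postcopy is enabled and the block's
 *                        page size differs from qemu_host_page_size)
 *     be64  mr->addr    (only when ignore-shared is enabled)
 *   be64  RAM_SAVE_FLAG_EOS
 * which is what ram_load_precopy() parses in its RAM_SAVE_FLAG_MEM_SIZE case.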
2413 */ 2414 2415 /** 2416 * ram_save_setup: Setup RAM for migration 2417 * 2418 * Returns zero to indicate success and negative for error 2419 * 2420 * @f: QEMUFile where to send the data 2421 * @opaque: RAMState pointer 2422 */ 2423 static int ram_save_setup(QEMUFile *f, void *opaque) 2424 { 2425 RAMState **rsp = opaque; 2426 RAMBlock *block; 2427 2428 if (compress_threads_save_setup()) { 2429 return -1; 2430 } 2431 2432 /* migration has already setup the bitmap, reuse it. */ 2433 if (!migration_in_colo_state()) { 2434 if (ram_init_all(rsp) != 0) { 2435 compress_threads_save_cleanup(); 2436 return -1; 2437 } 2438 } 2439 (*rsp)->f = f; 2440 2441 WITH_RCU_READ_LOCK_GUARD() { 2442 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2443 2444 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2445 qemu_put_byte(f, strlen(block->idstr)); 2446 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2447 qemu_put_be64(f, block->used_length); 2448 if (migrate_postcopy_ram() && block->page_size != 2449 qemu_host_page_size) { 2450 qemu_put_be64(f, block->page_size); 2451 } 2452 if (migrate_ignore_shared()) { 2453 qemu_put_be64(f, block->mr->addr); 2454 } 2455 } 2456 } 2457 2458 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2459 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2460 2461 multifd_send_sync_main(f); 2462 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2463 qemu_fflush(f); 2464 2465 return 0; 2466 } 2467 2468 /** 2469 * ram_save_iterate: iterative stage for migration 2470 * 2471 * Returns zero to indicate success and negative for error 2472 * 2473 * @f: QEMUFile where to send the data 2474 * @opaque: RAMState pointer 2475 */ 2476 static int ram_save_iterate(QEMUFile *f, void *opaque) 2477 { 2478 RAMState **temp = opaque; 2479 RAMState *rs = *temp; 2480 int ret = 0; 2481 int i; 2482 int64_t t0; 2483 int done = 0; 2484 2485 if (blk_mig_bulk_active()) { 2486 /* Avoid transferring ram during bulk phase of block migration as 2487 * the bulk phase will usually take a long time and transferring 2488 * ram updates during that time is pointless. */ 2489 goto out; 2490 } 2491 2492 WITH_RCU_READ_LOCK_GUARD() { 2493 if (ram_list.version != rs->last_version) { 2494 ram_state_reset(rs); 2495 } 2496 2497 /* Read version before ram_list.blocks */ 2498 smp_rmb(); 2499 2500 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2501 2502 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2503 i = 0; 2504 while ((ret = qemu_file_rate_limit(f)) == 0 || 2505 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2506 int pages; 2507 2508 if (qemu_file_get_error(f)) { 2509 break; 2510 } 2511 2512 pages = ram_find_and_save_block(rs, false); 2513 /* no more pages to sent */ 2514 if (pages == 0) { 2515 done = 1; 2516 break; 2517 } 2518 2519 if (pages < 0) { 2520 qemu_file_set_error(f, pages); 2521 break; 2522 } 2523 2524 rs->target_page_count += pages; 2525 2526 /* 2527 * During postcopy, it is necessary to make sure one whole host 2528 * page is sent in one chunk. 2529 */ 2530 if (migrate_postcopy_ram()) { 2531 flush_compressed_data(rs); 2532 } 2533 2534 /* 2535 * we want to check in the 1st loop, just in case it was the 1st 2536 * time and we had to sync the dirty bitmap. 
2537 * qemu_clock_get_ns() is a bit expensive, so we only check each 2538 * some iterations 2539 */ 2540 if ((i & 63) == 0) { 2541 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2542 1000000; 2543 if (t1 > MAX_WAIT) { 2544 trace_ram_save_iterate_big_wait(t1, i); 2545 break; 2546 } 2547 } 2548 i++; 2549 } 2550 } 2551 2552 /* 2553 * Must occur before EOS (or any QEMUFile operation) 2554 * because of RDMA protocol. 2555 */ 2556 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2557 2558 out: 2559 if (ret >= 0 2560 && migration_is_setup_or_active(migrate_get_current()->state)) { 2561 multifd_send_sync_main(rs->f); 2562 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2563 qemu_fflush(f); 2564 ram_counters.transferred += 8; 2565 2566 ret = qemu_file_get_error(f); 2567 } 2568 if (ret < 0) { 2569 return ret; 2570 } 2571 2572 return done; 2573 } 2574 2575 /** 2576 * ram_save_complete: function called to send the remaining amount of ram 2577 * 2578 * Returns zero to indicate success or negative on error 2579 * 2580 * Called with iothread lock 2581 * 2582 * @f: QEMUFile where to send the data 2583 * @opaque: RAMState pointer 2584 */ 2585 static int ram_save_complete(QEMUFile *f, void *opaque) 2586 { 2587 RAMState **temp = opaque; 2588 RAMState *rs = *temp; 2589 int ret = 0; 2590 2591 WITH_RCU_READ_LOCK_GUARD() { 2592 if (!migration_in_postcopy()) { 2593 migration_bitmap_sync_precopy(rs); 2594 } 2595 2596 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2597 2598 /* try transferring iterative blocks of memory */ 2599 2600 /* flush all remaining blocks regardless of rate limiting */ 2601 while (true) { 2602 int pages; 2603 2604 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2605 /* no more blocks to sent */ 2606 if (pages == 0) { 2607 break; 2608 } 2609 if (pages < 0) { 2610 ret = pages; 2611 break; 2612 } 2613 } 2614 2615 flush_compressed_data(rs); 2616 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2617 } 2618 2619 if (ret >= 0) { 2620 multifd_send_sync_main(rs->f); 2621 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2622 qemu_fflush(f); 2623 } 2624 2625 return ret; 2626 } 2627 2628 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2629 uint64_t *res_precopy_only, 2630 uint64_t *res_compatible, 2631 uint64_t *res_postcopy_only) 2632 { 2633 RAMState **temp = opaque; 2634 RAMState *rs = *temp; 2635 uint64_t remaining_size; 2636 2637 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2638 2639 if (!migration_in_postcopy() && 2640 remaining_size < max_size) { 2641 qemu_mutex_lock_iothread(); 2642 WITH_RCU_READ_LOCK_GUARD() { 2643 migration_bitmap_sync_precopy(rs); 2644 } 2645 qemu_mutex_unlock_iothread(); 2646 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2647 } 2648 2649 if (migrate_postcopy_ram()) { 2650 /* We can do postcopy, and all the data is postcopiable */ 2651 *res_compatible += remaining_size; 2652 } else { 2653 *res_precopy_only += remaining_size; 2654 } 2655 } 2656 2657 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2658 { 2659 unsigned int xh_len; 2660 int xh_flags; 2661 uint8_t *loaded_data; 2662 2663 /* extract RLE header */ 2664 xh_flags = qemu_get_byte(f); 2665 xh_len = qemu_get_be16(f); 2666 2667 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2668 error_report("Failed to load XBZRLE page - wrong compression!"); 2669 return -1; 2670 } 2671 2672 if (xh_len > TARGET_PAGE_SIZE) { 2673 error_report("Failed to load XBZRLE page - len overflow!"); 2674 return -1; 2675 } 2676 loaded_data = XBZRLE.decoded_buf; 2677 
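    /*
     * Wire format just parsed above (a sketch):
     *   u8    xh_flags   must be ENCODING_FLAG_XBZRLE
     *   be16  xh_len     length of the encoded delta, at most TARGET_PAGE_SIZE
     *   bytes encoded    XBZRLE delta, decoded below on top of the current
     *                    contents of the page at 'host'
     */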
/* load data and decode */ 2678 /* it can change loaded_data to point to an internal buffer */ 2679 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 2680 2681 /* decode RLE */ 2682 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 2683 TARGET_PAGE_SIZE) == -1) { 2684 error_report("Failed to load XBZRLE page - decode error!"); 2685 return -1; 2686 } 2687 2688 return 0; 2689 } 2690 2691 /** 2692 * ram_block_from_stream: read a RAMBlock id from the migration stream 2693 * 2694 * Must be called from within a rcu critical section. 2695 * 2696 * Returns a pointer from within the RCU-protected ram_list. 2697 * 2698 * @f: QEMUFile where to read the data from 2699 * @flags: Page flags (mostly to see if it's a continuation of previous block) 2700 */ 2701 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 2702 { 2703 static RAMBlock *block = NULL; 2704 char id[256]; 2705 uint8_t len; 2706 2707 if (flags & RAM_SAVE_FLAG_CONTINUE) { 2708 if (!block) { 2709 error_report("Ack, bad migration stream!"); 2710 return NULL; 2711 } 2712 return block; 2713 } 2714 2715 len = qemu_get_byte(f); 2716 qemu_get_buffer(f, (uint8_t *)id, len); 2717 id[len] = 0; 2718 2719 block = qemu_ram_block_by_name(id); 2720 if (!block) { 2721 error_report("Can't find block %s", id); 2722 return NULL; 2723 } 2724 2725 if (ramblock_is_ignored(block)) { 2726 error_report("block %s should not be migrated !", id); 2727 return NULL; 2728 } 2729 2730 return block; 2731 } 2732 2733 static inline void *host_from_ram_block_offset(RAMBlock *block, 2734 ram_addr_t offset) 2735 { 2736 if (!offset_in_ramblock(block, offset)) { 2737 return NULL; 2738 } 2739 2740 return block->host + offset; 2741 } 2742 2743 static inline void *colo_cache_from_block_offset(RAMBlock *block, 2744 ram_addr_t offset, bool record_bitmap) 2745 { 2746 if (!offset_in_ramblock(block, offset)) { 2747 return NULL; 2748 } 2749 if (!block->colo_cache) { 2750 error_report("%s: colo_cache is NULL in block :%s", 2751 __func__, block->idstr); 2752 return NULL; 2753 } 2754 2755 /* 2756 * During colo checkpoint, we need bitmap of these migrated pages. 2757 * It help us to decide which pages in ram cache should be flushed 2758 * into VM's RAM later. 2759 */ 2760 if (record_bitmap && 2761 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 2762 ram_state->migration_dirty_pages++; 2763 } 2764 return block->colo_cache + offset; 2765 } 2766 2767 /** 2768 * ram_handle_compressed: handle the zero page case 2769 * 2770 * If a page (or a whole RDMA chunk) has been 2771 * determined to be zero, then zap it. 2772 * 2773 * @host: host address for the zero page 2774 * @ch: what the page is filled from. 
We only support zero 2775 * @size: size of the zero page 2776 */ 2777 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 2778 { 2779 if (ch != 0 || !is_zero_range(host, size)) { 2780 memset(host, ch, size); 2781 } 2782 } 2783 2784 /* return the size after decompression, or negative value on error */ 2785 static int 2786 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 2787 const uint8_t *source, size_t source_len) 2788 { 2789 int err; 2790 2791 err = inflateReset(stream); 2792 if (err != Z_OK) { 2793 return -1; 2794 } 2795 2796 stream->avail_in = source_len; 2797 stream->next_in = (uint8_t *)source; 2798 stream->avail_out = dest_len; 2799 stream->next_out = dest; 2800 2801 err = inflate(stream, Z_NO_FLUSH); 2802 if (err != Z_STREAM_END) { 2803 return -1; 2804 } 2805 2806 return stream->total_out; 2807 } 2808 2809 static void *do_data_decompress(void *opaque) 2810 { 2811 DecompressParam *param = opaque; 2812 unsigned long pagesize; 2813 uint8_t *des; 2814 int len, ret; 2815 2816 qemu_mutex_lock(¶m->mutex); 2817 while (!param->quit) { 2818 if (param->des) { 2819 des = param->des; 2820 len = param->len; 2821 param->des = 0; 2822 qemu_mutex_unlock(¶m->mutex); 2823 2824 pagesize = TARGET_PAGE_SIZE; 2825 2826 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 2827 param->compbuf, len); 2828 if (ret < 0 && migrate_get_current()->decompress_error_check) { 2829 error_report("decompress data failed"); 2830 qemu_file_set_error(decomp_file, ret); 2831 } 2832 2833 qemu_mutex_lock(&decomp_done_lock); 2834 param->done = true; 2835 qemu_cond_signal(&decomp_done_cond); 2836 qemu_mutex_unlock(&decomp_done_lock); 2837 2838 qemu_mutex_lock(¶m->mutex); 2839 } else { 2840 qemu_cond_wait(¶m->cond, ¶m->mutex); 2841 } 2842 } 2843 qemu_mutex_unlock(¶m->mutex); 2844 2845 return NULL; 2846 } 2847 2848 static int wait_for_decompress_done(void) 2849 { 2850 int idx, thread_count; 2851 2852 if (!migrate_use_compression()) { 2853 return 0; 2854 } 2855 2856 thread_count = migrate_decompress_threads(); 2857 qemu_mutex_lock(&decomp_done_lock); 2858 for (idx = 0; idx < thread_count; idx++) { 2859 while (!decomp_param[idx].done) { 2860 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 2861 } 2862 } 2863 qemu_mutex_unlock(&decomp_done_lock); 2864 return qemu_file_get_error(decomp_file); 2865 } 2866 2867 static void compress_threads_load_cleanup(void) 2868 { 2869 int i, thread_count; 2870 2871 if (!migrate_use_compression()) { 2872 return; 2873 } 2874 thread_count = migrate_decompress_threads(); 2875 for (i = 0; i < thread_count; i++) { 2876 /* 2877 * we use it as a indicator which shows if the thread is 2878 * properly init'd or not 2879 */ 2880 if (!decomp_param[i].compbuf) { 2881 break; 2882 } 2883 2884 qemu_mutex_lock(&decomp_param[i].mutex); 2885 decomp_param[i].quit = true; 2886 qemu_cond_signal(&decomp_param[i].cond); 2887 qemu_mutex_unlock(&decomp_param[i].mutex); 2888 } 2889 for (i = 0; i < thread_count; i++) { 2890 if (!decomp_param[i].compbuf) { 2891 break; 2892 } 2893 2894 qemu_thread_join(decompress_threads + i); 2895 qemu_mutex_destroy(&decomp_param[i].mutex); 2896 qemu_cond_destroy(&decomp_param[i].cond); 2897 inflateEnd(&decomp_param[i].stream); 2898 g_free(decomp_param[i].compbuf); 2899 decomp_param[i].compbuf = NULL; 2900 } 2901 g_free(decompress_threads); 2902 g_free(decomp_param); 2903 decompress_threads = NULL; 2904 decomp_param = NULL; 2905 decomp_file = NULL; 2906 } 2907 2908 static int compress_threads_load_setup(QEMUFile *f) 2909 { 2910 int i, thread_count; 
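    /*
     * One joinable worker per DecompressParam, each with its own z_stream,
     * compbuf, mutex and condvar.  A worker sleeps on its param->cond until
     * it is handed a buffer; completion is published by setting 'done' under
     * decomp_done_lock and signalling decomp_done_cond, which
     * decompress_data_with_multi_threads() and wait_for_decompress_done()
     * above wait on.
     */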
2911 2912 if (!migrate_use_compression()) { 2913 return 0; 2914 } 2915 2916 thread_count = migrate_decompress_threads(); 2917 decompress_threads = g_new0(QemuThread, thread_count); 2918 decomp_param = g_new0(DecompressParam, thread_count); 2919 qemu_mutex_init(&decomp_done_lock); 2920 qemu_cond_init(&decomp_done_cond); 2921 decomp_file = f; 2922 for (i = 0; i < thread_count; i++) { 2923 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 2924 goto exit; 2925 } 2926 2927 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 2928 qemu_mutex_init(&decomp_param[i].mutex); 2929 qemu_cond_init(&decomp_param[i].cond); 2930 decomp_param[i].done = true; 2931 decomp_param[i].quit = false; 2932 qemu_thread_create(decompress_threads + i, "decompress", 2933 do_data_decompress, decomp_param + i, 2934 QEMU_THREAD_JOINABLE); 2935 } 2936 return 0; 2937 exit: 2938 compress_threads_load_cleanup(); 2939 return -1; 2940 } 2941 2942 static void decompress_data_with_multi_threads(QEMUFile *f, 2943 void *host, int len) 2944 { 2945 int idx, thread_count; 2946 2947 thread_count = migrate_decompress_threads(); 2948 qemu_mutex_lock(&decomp_done_lock); 2949 while (true) { 2950 for (idx = 0; idx < thread_count; idx++) { 2951 if (decomp_param[idx].done) { 2952 decomp_param[idx].done = false; 2953 qemu_mutex_lock(&decomp_param[idx].mutex); 2954 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 2955 decomp_param[idx].des = host; 2956 decomp_param[idx].len = len; 2957 qemu_cond_signal(&decomp_param[idx].cond); 2958 qemu_mutex_unlock(&decomp_param[idx].mutex); 2959 break; 2960 } 2961 } 2962 if (idx < thread_count) { 2963 break; 2964 } else { 2965 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 2966 } 2967 } 2968 qemu_mutex_unlock(&decomp_done_lock); 2969 } 2970 2971 /* 2972 * colo cache: this is for secondary VM, we cache the whole 2973 * memory of the secondary VM, it is need to hold the global lock 2974 * to call this helper. 2975 */ 2976 int colo_init_ram_cache(void) 2977 { 2978 RAMBlock *block; 2979 2980 WITH_RCU_READ_LOCK_GUARD() { 2981 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2982 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 2983 NULL, 2984 false); 2985 if (!block->colo_cache) { 2986 error_report("%s: Can't alloc memory for COLO cache of block %s," 2987 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 2988 block->used_length); 2989 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2990 if (block->colo_cache) { 2991 qemu_anon_ram_free(block->colo_cache, block->used_length); 2992 block->colo_cache = NULL; 2993 } 2994 } 2995 return -errno; 2996 } 2997 } 2998 } 2999 3000 /* 3001 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3002 * with to decide which page in cache should be flushed into SVM's RAM. Here 3003 * we use the same name 'ram_bitmap' as for migration. 3004 */ 3005 if (ram_bytes_total()) { 3006 RAMBlock *block; 3007 3008 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3009 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3010 block->bmap = bitmap_new(pages); 3011 } 3012 } 3013 3014 ram_state_init(&ram_state); 3015 return 0; 3016 } 3017 3018 /* TODO: duplicated with ram_init_bitmaps */ 3019 void colo_incoming_start_dirty_log(void) 3020 { 3021 RAMBlock *block = NULL; 3022 /* For memory_global_dirty_log_start below. 
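       Same lock order as in ram_init_bitmaps(): BQL, then the ram_list lock,
       with the per-block work under the RCU read guard.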
*/ 3023 qemu_mutex_lock_iothread(); 3024 qemu_mutex_lock_ramlist(); 3025 3026 memory_global_dirty_log_sync(); 3027 WITH_RCU_READ_LOCK_GUARD() { 3028 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3029 ramblock_sync_dirty_bitmap(ram_state, block); 3030 /* Discard this dirty bitmap record */ 3031 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3032 } 3033 memory_global_dirty_log_start(); 3034 } 3035 ram_state->migration_dirty_pages = 0; 3036 qemu_mutex_unlock_ramlist(); 3037 qemu_mutex_unlock_iothread(); 3038 } 3039 3040 /* It is need to hold the global lock to call this helper */ 3041 void colo_release_ram_cache(void) 3042 { 3043 RAMBlock *block; 3044 3045 memory_global_dirty_log_stop(); 3046 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3047 g_free(block->bmap); 3048 block->bmap = NULL; 3049 } 3050 3051 WITH_RCU_READ_LOCK_GUARD() { 3052 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3053 if (block->colo_cache) { 3054 qemu_anon_ram_free(block->colo_cache, block->used_length); 3055 block->colo_cache = NULL; 3056 } 3057 } 3058 } 3059 ram_state_cleanup(&ram_state); 3060 } 3061 3062 /** 3063 * ram_load_setup: Setup RAM for migration incoming side 3064 * 3065 * Returns zero to indicate success and negative for error 3066 * 3067 * @f: QEMUFile where to receive the data 3068 * @opaque: RAMState pointer 3069 */ 3070 static int ram_load_setup(QEMUFile *f, void *opaque) 3071 { 3072 if (compress_threads_load_setup(f)) { 3073 return -1; 3074 } 3075 3076 xbzrle_load_setup(); 3077 ramblock_recv_map_init(); 3078 3079 return 0; 3080 } 3081 3082 static int ram_load_cleanup(void *opaque) 3083 { 3084 RAMBlock *rb; 3085 3086 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3087 qemu_ram_block_writeback(rb); 3088 } 3089 3090 xbzrle_load_cleanup(); 3091 compress_threads_load_cleanup(); 3092 3093 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3094 g_free(rb->receivedmap); 3095 rb->receivedmap = NULL; 3096 } 3097 3098 return 0; 3099 } 3100 3101 /** 3102 * ram_postcopy_incoming_init: allocate postcopy data structures 3103 * 3104 * Returns 0 for success and negative if there was one error 3105 * 3106 * @mis: current migration incoming state 3107 * 3108 * Allocate data structures etc needed by incoming migration with 3109 * postcopy-ram. postcopy-ram's similarly names 3110 * postcopy_ram_incoming_init does the work. 3111 */ 3112 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3113 { 3114 return postcopy_ram_incoming_init(mis); 3115 } 3116 3117 /** 3118 * ram_load_postcopy: load a page in postcopy case 3119 * 3120 * Returns 0 for success or -errno in case of error 3121 * 3122 * Called in postcopy mode by ram_load(). 3123 * rcu_read_lock is taken prior to this being called. 
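 * Incoming target pages are assembled in mis->postcopy_tmp_page until the
 * last target page of a host page arrives; only then is the whole host page
 * placed atomically with postcopy_place_page(), or postcopy_place_page_zero()
 * when every target page in it was zero.  (For blocks whose page size equals
 * the target page size, RAM_SAVE_FLAG_PAGE data may instead be placed
 * directly from the QEMUFile buffer.)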
3124 * 3125 * @f: QEMUFile where to send the data 3126 */ 3127 static int ram_load_postcopy(QEMUFile *f) 3128 { 3129 int flags = 0, ret = 0; 3130 bool place_needed = false; 3131 bool matches_target_page_size = false; 3132 MigrationIncomingState *mis = migration_incoming_get_current(); 3133 /* Temporary page that is later 'placed' */ 3134 void *postcopy_host_page = mis->postcopy_tmp_page; 3135 void *this_host = NULL; 3136 bool all_zero = false; 3137 int target_pages = 0; 3138 3139 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3140 ram_addr_t addr; 3141 void *host = NULL; 3142 void *page_buffer = NULL; 3143 void *place_source = NULL; 3144 RAMBlock *block = NULL; 3145 uint8_t ch; 3146 int len; 3147 3148 addr = qemu_get_be64(f); 3149 3150 /* 3151 * If qemu file error, we should stop here, and then "addr" 3152 * may be invalid 3153 */ 3154 ret = qemu_file_get_error(f); 3155 if (ret) { 3156 break; 3157 } 3158 3159 flags = addr & ~TARGET_PAGE_MASK; 3160 addr &= TARGET_PAGE_MASK; 3161 3162 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3163 place_needed = false; 3164 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3165 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3166 block = ram_block_from_stream(f, flags); 3167 3168 host = host_from_ram_block_offset(block, addr); 3169 if (!host) { 3170 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3171 ret = -EINVAL; 3172 break; 3173 } 3174 target_pages++; 3175 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3176 /* 3177 * Postcopy requires that we place whole host pages atomically; 3178 * these may be huge pages for RAMBlocks that are backed by 3179 * hugetlbfs. 3180 * To make it atomic, the data is read into a temporary page 3181 * that's moved into place later. 3182 * The migration protocol uses, possibly smaller, target-pages 3183 * however the source ensures it always sends all the components 3184 * of a host page in one chunk. 3185 */ 3186 page_buffer = postcopy_host_page + 3187 ((uintptr_t)host & (block->page_size - 1)); 3188 /* If all TP are zero then we can optimise the place */ 3189 if (target_pages == 1) { 3190 all_zero = true; 3191 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3192 block->page_size); 3193 } else { 3194 /* not the 1st TP within the HP */ 3195 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != 3196 (uintptr_t)this_host) { 3197 error_report("Non-same host page %p/%p", 3198 host, this_host); 3199 ret = -EINVAL; 3200 break; 3201 } 3202 } 3203 3204 /* 3205 * If it's the last part of a host page then we place the host 3206 * page 3207 */ 3208 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3209 place_needed = true; 3210 target_pages = 0; 3211 } 3212 place_source = postcopy_host_page; 3213 } 3214 3215 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3216 case RAM_SAVE_FLAG_ZERO: 3217 ch = qemu_get_byte(f); 3218 /* 3219 * Can skip to set page_buffer when 3220 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3221 */ 3222 if (ch || !matches_target_page_size) { 3223 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3224 } 3225 if (ch) { 3226 all_zero = false; 3227 } 3228 break; 3229 3230 case RAM_SAVE_FLAG_PAGE: 3231 all_zero = false; 3232 if (!matches_target_page_size) { 3233 /* For huge pages, we always use temporary buffer */ 3234 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3235 } else { 3236 /* 3237 * For small pages that matches target page size, we 3238 * avoid the qemu_file copy. Instead we directly use 3239 * the buffer of QEMUFile to place the page. 
Note: we 3240 * cannot do any QEMUFile operation before using that 3241 * buffer to make sure the buffer is valid when 3242 * placing the page. 3243 */ 3244 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3245 TARGET_PAGE_SIZE); 3246 } 3247 break; 3248 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3249 all_zero = false; 3250 len = qemu_get_be32(f); 3251 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3252 error_report("Invalid compressed data length: %d", len); 3253 ret = -EINVAL; 3254 break; 3255 } 3256 decompress_data_with_multi_threads(f, page_buffer, len); 3257 break; 3258 3259 case RAM_SAVE_FLAG_EOS: 3260 /* normal exit */ 3261 multifd_recv_sync_main(); 3262 break; 3263 default: 3264 error_report("Unknown combination of migration flags: %#x" 3265 " (postcopy mode)", flags); 3266 ret = -EINVAL; 3267 break; 3268 } 3269 3270 /* Got the whole host page, wait for decompress before placing. */ 3271 if (place_needed) { 3272 ret |= wait_for_decompress_done(); 3273 } 3274 3275 /* Detect for any possible file errors */ 3276 if (!ret && qemu_file_get_error(f)) { 3277 ret = qemu_file_get_error(f); 3278 } 3279 3280 if (!ret && place_needed) { 3281 /* This gets called at the last target page in the host page */ 3282 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3283 block->page_size); 3284 3285 if (all_zero) { 3286 ret = postcopy_place_page_zero(mis, place_dest, 3287 block); 3288 } else { 3289 ret = postcopy_place_page(mis, place_dest, 3290 place_source, block); 3291 } 3292 } 3293 } 3294 3295 return ret; 3296 } 3297 3298 static bool postcopy_is_advised(void) 3299 { 3300 PostcopyState ps = postcopy_state_get(); 3301 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3302 } 3303 3304 static bool postcopy_is_running(void) 3305 { 3306 PostcopyState ps = postcopy_state_get(); 3307 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3308 } 3309 3310 /* 3311 * Flush content of RAM cache into SVM's memory. 3312 * Only flush the pages that be dirtied by PVM or SVM or both. 3313 */ 3314 static void colo_flush_ram_cache(void) 3315 { 3316 RAMBlock *block = NULL; 3317 void *dst_host; 3318 void *src_host; 3319 unsigned long offset = 0; 3320 3321 memory_global_dirty_log_sync(); 3322 WITH_RCU_READ_LOCK_GUARD() { 3323 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3324 ramblock_sync_dirty_bitmap(ram_state, block); 3325 } 3326 } 3327 3328 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3329 WITH_RCU_READ_LOCK_GUARD() { 3330 block = QLIST_FIRST_RCU(&ram_list.blocks); 3331 3332 while (block) { 3333 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3334 3335 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3336 >= block->used_length) { 3337 offset = 0; 3338 block = QLIST_NEXT_RCU(block, next); 3339 } else { 3340 migration_bitmap_clear_dirty(ram_state, block, offset); 3341 dst_host = block->host 3342 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3343 src_host = block->colo_cache 3344 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3345 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3346 } 3347 } 3348 } 3349 trace_colo_flush_ram_cache_end(); 3350 } 3351 3352 /** 3353 * ram_load_precopy: load pages in precopy case 3354 * 3355 * Returns 0 for success or -errno in case of error 3356 * 3357 * Called in precopy mode by ram_load(). 3358 * rcu_read_lock is taken prior to this being called. 
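 * Each record starts with a be64 whose bits below TARGET_PAGE_MASK carry
 * RAM_SAVE_FLAG_* values and whose remaining bits give the page's offset
 * within its RAMBlock; the block is named by an id string in the stream, or
 * reused from the previous record when RAM_SAVE_FLAG_CONTINUE is set (see
 * ram_block_from_stream()).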
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is sent earlier; it shows that the source has the postcopy capability enabled */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration of
         * the main loop is expensive, so only do it every so many iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage we should not load pages into
             * the SVM's memory directly; we put them into colo_cache first.
             * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, all of this memory was copied in the COLO
             * preparation stage, which required stopping the VM and was
             * time-consuming.  Instead we back up every page during the
             * migration while COLO is enabled.  This slows the migration
             * down somewhat, but it clearly reduces the downtime that
             * backing up all of the SVM's memory in the COLO preparation
             * stage would cause.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In the migration stage but before the COLO stage,
                     * put all pages into both the cache and the SVM's memory.
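                     * (host_bak is consumed at the bottom of the loop,
                     * where the freshly loaded page is copied into the
                     * COLO cache.)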
3424 */ 3425 host_bak = colo_cache_from_block_offset(block, addr, false); 3426 } 3427 } 3428 if (!host) { 3429 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3430 ret = -EINVAL; 3431 break; 3432 } 3433 if (!migration_incoming_in_colo_state()) { 3434 ramblock_recv_bitmap_set(block, host); 3435 } 3436 3437 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3438 } 3439 3440 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3441 case RAM_SAVE_FLAG_MEM_SIZE: 3442 /* Synchronize RAM block list */ 3443 total_ram_bytes = addr; 3444 while (!ret && total_ram_bytes) { 3445 RAMBlock *block; 3446 char id[256]; 3447 ram_addr_t length; 3448 3449 len = qemu_get_byte(f); 3450 qemu_get_buffer(f, (uint8_t *)id, len); 3451 id[len] = 0; 3452 length = qemu_get_be64(f); 3453 3454 block = qemu_ram_block_by_name(id); 3455 if (block && !qemu_ram_is_migratable(block)) { 3456 error_report("block %s should not be migrated !", id); 3457 ret = -EINVAL; 3458 } else if (block) { 3459 if (length != block->used_length) { 3460 Error *local_err = NULL; 3461 3462 ret = qemu_ram_resize(block, length, 3463 &local_err); 3464 if (local_err) { 3465 error_report_err(local_err); 3466 } 3467 } 3468 /* For postcopy we need to check hugepage sizes match */ 3469 if (postcopy_advised && 3470 block->page_size != qemu_host_page_size) { 3471 uint64_t remote_page_size = qemu_get_be64(f); 3472 if (remote_page_size != block->page_size) { 3473 error_report("Mismatched RAM page size %s " 3474 "(local) %zd != %" PRId64, 3475 id, block->page_size, 3476 remote_page_size); 3477 ret = -EINVAL; 3478 } 3479 } 3480 if (migrate_ignore_shared()) { 3481 hwaddr addr = qemu_get_be64(f); 3482 if (ramblock_is_ignored(block) && 3483 block->mr->addr != addr) { 3484 error_report("Mismatched GPAs for block %s " 3485 "%" PRId64 "!= %" PRId64, 3486 id, (uint64_t)addr, 3487 (uint64_t)block->mr->addr); 3488 ret = -EINVAL; 3489 } 3490 } 3491 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3492 block->idstr); 3493 } else { 3494 error_report("Unknown ramblock \"%s\", cannot " 3495 "accept migration", id); 3496 ret = -EINVAL; 3497 } 3498 3499 total_ram_bytes -= length; 3500 } 3501 break; 3502 3503 case RAM_SAVE_FLAG_ZERO: 3504 ch = qemu_get_byte(f); 3505 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3506 break; 3507 3508 case RAM_SAVE_FLAG_PAGE: 3509 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3510 break; 3511 3512 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3513 len = qemu_get_be32(f); 3514 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3515 error_report("Invalid compressed data length: %d", len); 3516 ret = -EINVAL; 3517 break; 3518 } 3519 decompress_data_with_multi_threads(f, host, len); 3520 break; 3521 3522 case RAM_SAVE_FLAG_XBZRLE: 3523 if (load_xbzrle(f, addr, host) < 0) { 3524 error_report("Failed to decompress XBZRLE page at " 3525 RAM_ADDR_FMT, addr); 3526 ret = -EINVAL; 3527 break; 3528 } 3529 break; 3530 case RAM_SAVE_FLAG_EOS: 3531 /* normal exit */ 3532 multifd_recv_sync_main(); 3533 break; 3534 default: 3535 if (flags & RAM_SAVE_FLAG_HOOK) { 3536 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3537 } else { 3538 error_report("Unknown combination of migration flags: %#x", 3539 flags); 3540 ret = -EINVAL; 3541 } 3542 } 3543 if (!ret) { 3544 ret = qemu_file_get_error(f); 3545 } 3546 if (!ret && host_bak) { 3547 memcpy(host_bak, host, TARGET_PAGE_SIZE); 3548 } 3549 } 3550 3551 ret |= wait_for_decompress_done(); 3552 return ret; 3553 } 3554 3555 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3556 { 3557 int ret = 0; 3558 
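    /*
     * Only version 4 of the "ram" section is accepted here; that is the
     * version registered by register_savevm_live() in ram_mig_init() at
     * the bottom of this file.
     */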
static uint64_t seq_iter; 3559 /* 3560 * If system is running in postcopy mode, page inserts to host memory must 3561 * be atomic 3562 */ 3563 bool postcopy_running = postcopy_is_running(); 3564 3565 seq_iter++; 3566 3567 if (version_id != 4) { 3568 return -EINVAL; 3569 } 3570 3571 /* 3572 * This RCU critical section can be very long running. 3573 * When RCU reclaims in the code start to become numerous, 3574 * it will be necessary to reduce the granularity of this 3575 * critical section. 3576 */ 3577 WITH_RCU_READ_LOCK_GUARD() { 3578 if (postcopy_running) { 3579 ret = ram_load_postcopy(f); 3580 } else { 3581 ret = ram_load_precopy(f); 3582 } 3583 } 3584 trace_ram_load_complete(ret, seq_iter); 3585 3586 if (!ret && migration_incoming_in_colo_state()) { 3587 colo_flush_ram_cache(); 3588 } 3589 return ret; 3590 } 3591 3592 static bool ram_has_postcopy(void *opaque) 3593 { 3594 RAMBlock *rb; 3595 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3596 if (ramblock_is_pmem(rb)) { 3597 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 3598 "is not supported now!", rb->idstr, rb->host); 3599 return false; 3600 } 3601 } 3602 3603 return migrate_postcopy_ram(); 3604 } 3605 3606 /* Sync all the dirty bitmap with destination VM. */ 3607 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3608 { 3609 RAMBlock *block; 3610 QEMUFile *file = s->to_dst_file; 3611 int ramblock_count = 0; 3612 3613 trace_ram_dirty_bitmap_sync_start(); 3614 3615 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3616 qemu_savevm_send_recv_bitmap(file, block->idstr); 3617 trace_ram_dirty_bitmap_request(block->idstr); 3618 ramblock_count++; 3619 } 3620 3621 trace_ram_dirty_bitmap_sync_wait(); 3622 3623 /* Wait until all the ramblocks' dirty bitmap synced */ 3624 while (ramblock_count--) { 3625 qemu_sem_wait(&s->rp_state.rp_sem); 3626 } 3627 3628 trace_ram_dirty_bitmap_sync_complete(); 3629 3630 return 0; 3631 } 3632 3633 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3634 { 3635 qemu_sem_post(&s->rp_state.rp_sem); 3636 } 3637 3638 /* 3639 * Read the received bitmap, revert it as the initial dirty bitmap. 3640 * This is only used when the postcopy migration is paused but wants 3641 * to resume from a middle point. 3642 */ 3643 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3644 { 3645 int ret = -EINVAL; 3646 QEMUFile *file = s->rp_state.from_dst_file; 3647 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3648 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 3649 uint64_t size, end_mark; 3650 3651 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3652 3653 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3654 error_report("%s: incorrect state %s", __func__, 3655 MigrationStatus_str(s->state)); 3656 return -EINVAL; 3657 } 3658 3659 /* 3660 * Note: see comments in ramblock_recv_bitmap_send() on why we 3661 * need the endianess convertion, and the paddings. 
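 *
 * The layout read below is, roughly:
 *   be64   size        bitmap size in bytes, 8-byte aligned
 *   bytes  le_bitmap   little-endian bitmap of received pages, 'size' bytes
 *   be64   end mark    must be RAMBLOCK_RECV_BITMAP_ENDING
 * For example, with 4 KiB target pages a 1 GiB RAMBlock has nbits = 262144
 * and local_size = 32768 bytes.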
3662 */ 3663 local_size = ROUND_UP(local_size, 8); 3664 3665 /* Add paddings */ 3666 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 3667 3668 size = qemu_get_be64(file); 3669 3670 /* The size of the bitmap should match with our ramblock */ 3671 if (size != local_size) { 3672 error_report("%s: ramblock '%s' bitmap size mismatch " 3673 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 3674 block->idstr, size, local_size); 3675 ret = -EINVAL; 3676 goto out; 3677 } 3678 3679 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 3680 end_mark = qemu_get_be64(file); 3681 3682 ret = qemu_file_get_error(file); 3683 if (ret || size != local_size) { 3684 error_report("%s: read bitmap failed for ramblock '%s': %d" 3685 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 3686 __func__, block->idstr, ret, local_size, size); 3687 ret = -EIO; 3688 goto out; 3689 } 3690 3691 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 3692 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64, 3693 __func__, block->idstr, end_mark); 3694 ret = -EINVAL; 3695 goto out; 3696 } 3697 3698 /* 3699 * Endianess convertion. We are during postcopy (though paused). 3700 * The dirty bitmap won't change. We can directly modify it. 3701 */ 3702 bitmap_from_le(block->bmap, le_bitmap, nbits); 3703 3704 /* 3705 * What we received is "received bitmap". Revert it as the initial 3706 * dirty bitmap for this ramblock. 3707 */ 3708 bitmap_complement(block->bmap, block->bmap, nbits); 3709 3710 trace_ram_dirty_bitmap_reload_complete(block->idstr); 3711 3712 /* 3713 * We succeeded to sync bitmap for current ramblock. If this is 3714 * the last one to sync, we need to notify the main send thread. 3715 */ 3716 ram_dirty_bitmap_reload_notify(s); 3717 3718 ret = 0; 3719 out: 3720 g_free(le_bitmap); 3721 return ret; 3722 } 3723 3724 static int ram_resume_prepare(MigrationState *s, void *opaque) 3725 { 3726 RAMState *rs = *(RAMState **)opaque; 3727 int ret; 3728 3729 ret = ram_dirty_bitmap_sync_all(s, rs); 3730 if (ret) { 3731 return ret; 3732 } 3733 3734 ram_state_resume_prepare(rs, s->to_dst_file); 3735 3736 return 0; 3737 } 3738 3739 static SaveVMHandlers savevm_ram_handlers = { 3740 .save_setup = ram_save_setup, 3741 .save_live_iterate = ram_save_iterate, 3742 .save_live_complete_postcopy = ram_save_complete, 3743 .save_live_complete_precopy = ram_save_complete, 3744 .has_postcopy = ram_has_postcopy, 3745 .save_live_pending = ram_save_pending, 3746 .load_state = ram_load, 3747 .save_cleanup = ram_save_cleanup, 3748 .load_setup = ram_load_setup, 3749 .load_cleanup = ram_load_cleanup, 3750 .resume_prepare = ram_resume_prepare, 3751 }; 3752 3753 void ram_mig_init(void) 3754 { 3755 qemu_mutex_init(&XBZRLE.lock); 3756 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 3757 } 3758