1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
66 */ 67 68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 69 #define RAM_SAVE_FLAG_ZERO 0x02 70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 71 #define RAM_SAVE_FLAG_PAGE 0x08 72 #define RAM_SAVE_FLAG_EOS 0x10 73 #define RAM_SAVE_FLAG_CONTINUE 0x20 74 #define RAM_SAVE_FLAG_XBZRLE 0x40 75 /* 0x80 is reserved in migration.h start with 0x100 next */ 76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 77 78 static inline bool is_zero_range(uint8_t *p, uint64_t size) 79 { 80 return buffer_is_zero(p, size); 81 } 82 83 XBZRLECacheStats xbzrle_counters; 84 85 /* struct contains XBZRLE cache and a static page 86 used by the compression */ 87 static struct { 88 /* buffer used for XBZRLE encoding */ 89 uint8_t *encoded_buf; 90 /* buffer for storing page content */ 91 uint8_t *current_buf; 92 /* Cache for XBZRLE, Protected by lock. */ 93 PageCache *cache; 94 QemuMutex lock; 95 /* it will store a page full of zeros */ 96 uint8_t *zero_target_page; 97 /* buffer used for XBZRLE decoding */ 98 uint8_t *decoded_buf; 99 } XBZRLE; 100 101 static void XBZRLE_cache_lock(void) 102 { 103 if (migrate_use_xbzrle()) 104 qemu_mutex_lock(&XBZRLE.lock); 105 } 106 107 static void XBZRLE_cache_unlock(void) 108 { 109 if (migrate_use_xbzrle()) 110 qemu_mutex_unlock(&XBZRLE.lock); 111 } 112 113 /** 114 * xbzrle_cache_resize: resize the xbzrle cache 115 * 116 * This function is called from qmp_migrate_set_cache_size in main 117 * thread, possibly while a migration is in progress. A running 118 * migration may be using the cache and might finish during this call, 119 * hence changes to the cache are protected by XBZRLE.lock(). 
120 * 121 * Returns 0 for success or -1 for error 122 * 123 * @new_size: new cache size 124 * @errp: set *errp if the check failed, with reason 125 */ 126 int xbzrle_cache_resize(int64_t new_size, Error **errp) 127 { 128 PageCache *new_cache; 129 int64_t ret = 0; 130 131 /* Check for truncation */ 132 if (new_size != (size_t)new_size) { 133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 134 "exceeding address space"); 135 return -1; 136 } 137 138 if (new_size == migrate_xbzrle_cache_size()) { 139 /* nothing to do */ 140 return 0; 141 } 142 143 XBZRLE_cache_lock(); 144 145 if (XBZRLE.cache != NULL) { 146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 147 if (!new_cache) { 148 ret = -1; 149 goto out; 150 } 151 152 cache_fini(XBZRLE.cache); 153 XBZRLE.cache = new_cache; 154 } 155 out: 156 XBZRLE_cache_unlock(); 157 return ret; 158 } 159 160 static bool ramblock_is_ignored(RAMBlock *block) 161 { 162 return !qemu_ram_is_migratable(block) || 163 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 164 } 165 166 /* Should be holding either ram_list.mutex, or the RCU lock. 
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH

/*
 * Invoke @func on every RAMBlock that is not ignored for migration.
 * Iteration stops at the first non-zero return value, which is
 * propagated to the caller; 0 means every block was visited.
 */
int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

/*
 * Allocate the destination-side "received" bitmap (one bit per target
 * page) for every RAMBlock taking part in migration.
 */
static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

/* Test whether the page containing @host_addr was already received */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

/* Mark @nr consecutive pages starting at @host_addr as received */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that source and destination VMs not using the
     * same endianness still agree. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    /* Payload bytes plus the 8-byte size field sent ahead of it */
    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* amount of count that no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

/* Notify every precopy notifier of @reason; returns the first non-zero
 * notifier result (0 when all notifiers succeeded). */
int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

/* Bytes of guest RAM still marked dirty; 0 before migration state exists */
uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

/* Per-compression-thread job state; handed to do_data_compress() */
struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

/* Per-decompression-thread job state */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/*
 * Body of one compression worker thread: wait on param->cond for a
 * (block, offset) job, compress that page, then publish the result
 * under comp_done_lock and signal comp_done_cond so the migration
 * thread can collect it.  Terminates when param->quit is set.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* Drop the job lock while compressing so new work can be posted */
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

/* Stop, join and free all compression threads and their resources */
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

/*
 * Allocate per-thread state and start the compression threads.
 * Returns 0 on success, -1 on failure (partially-created state is
 * torn down via compress_threads_save_cleanup()).
 */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
    int pct_max = s->parameters.max_cpu_throttle;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate; capped at pct_max */
        cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
                         pct_max));
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        /* A cache miss means the page must go out uncompressed */
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* 1 byte encoding flag + 2 bytes encoded length + payload */
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in
     * the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

/*
 * Clear the dirty bit for @page in @rb and account it in
 * rs->migration_dirty_pages; returns the previous bit value.  Also
 * lazily clears the remote (KVM) dirty-bitmap chunk covering the page
 * before any page of that chunk is sent.
 */
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long. We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

/* Total target pages handled so far, over all transport mechanisms */
uint64_t ram_get_total_transferred_pages(void)
{
    return  ram_counters.normal + ram_counters.duplicate +
                compression_counters.pages + xbzrle_counters.pages;
}

/* Recompute the per-period rate counters (dirty-page, xbzrle miss and
 * compression rates) at the end of a sync period ending at @end_time. */
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

/* Sync the dirty log into the migration bitmaps for all blocks and run
 * the auto-converge throttling logic once per second. */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration.
         */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

/* Like migration_bitmap_sync() but bracketed by the precopy notifiers */
static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
972 */ 973 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 974 error_report_err(local_err); 975 } 976 977 migration_bitmap_sync(rs); 978 979 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 980 error_report_err(local_err); 981 } 982 } 983 984 /** 985 * save_zero_page_to_file: send the zero page to the file 986 * 987 * Returns the size of data written to the file, 0 means the page is not 988 * a zero page 989 * 990 * @rs: current RAM state 991 * @file: the file where the data is saved 992 * @block: block that contains the page we want to send 993 * @offset: offset inside the block for the page 994 */ 995 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 996 RAMBlock *block, ram_addr_t offset) 997 { 998 uint8_t *p = block->host + offset; 999 int len = 0; 1000 1001 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1002 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1003 qemu_put_byte(file, 0); 1004 len += 1; 1005 } 1006 return len; 1007 } 1008 1009 /** 1010 * save_zero_page: send the zero page to the stream 1011 * 1012 * Returns the number of pages written. 
1013 * 1014 * @rs: current RAM state 1015 * @block: block that contains the page we want to send 1016 * @offset: offset inside the block for the page 1017 */ 1018 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1019 { 1020 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1021 1022 if (len) { 1023 ram_counters.duplicate++; 1024 ram_counters.transferred += len; 1025 return 1; 1026 } 1027 return -1; 1028 } 1029 1030 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1031 { 1032 if (!migrate_release_ram() || !migration_in_postcopy()) { 1033 return; 1034 } 1035 1036 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1037 } 1038 1039 /* 1040 * @pages: the number of pages written by the control path, 1041 * < 0 - error 1042 * > 0 - number of pages written 1043 * 1044 * Return true if the pages has been saved, otherwise false is returned. 1045 */ 1046 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1047 int *pages) 1048 { 1049 uint64_t bytes_xmit = 0; 1050 int ret; 1051 1052 *pages = -1; 1053 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1054 &bytes_xmit); 1055 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1056 return false; 1057 } 1058 1059 if (bytes_xmit) { 1060 ram_counters.transferred += bytes_xmit; 1061 *pages = 1; 1062 } 1063 1064 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1065 return true; 1066 } 1067 1068 if (bytes_xmit > 0) { 1069 ram_counters.normal++; 1070 } else if (bytes_xmit == 0) { 1071 ram_counters.duplicate++; 1072 } 1073 1074 return true; 1075 } 1076 1077 /* 1078 * directly send the page to the stream 1079 * 1080 * Returns the number of pages written. 
1081 * 1082 * @rs: current RAM state 1083 * @block: block that contains the page we want to send 1084 * @offset: offset inside the block for the page 1085 * @buf: the page to be sent 1086 * @async: send to page asyncly 1087 */ 1088 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1089 uint8_t *buf, bool async) 1090 { 1091 ram_counters.transferred += save_page_header(rs, rs->f, block, 1092 offset | RAM_SAVE_FLAG_PAGE); 1093 if (async) { 1094 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1095 migrate_release_ram() & 1096 migration_in_postcopy()); 1097 } else { 1098 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1099 } 1100 ram_counters.transferred += TARGET_PAGE_SIZE; 1101 ram_counters.normal++; 1102 return 1; 1103 } 1104 1105 /** 1106 * ram_save_page: send the given page to the stream 1107 * 1108 * Returns the number of pages written. 1109 * < 0 - error 1110 * >=0 - Number of pages written - this might legally be 0 1111 * if xbzrle noticed the page was the same. 
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    /* XBZRLE only after the bulk stage, and never during postcopy */
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

/*
 * ram_save_multifd_page: queue one page onto a multifd channel
 *
 * Returns 1 on success, -1 if the page could not be queued.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

/*
 * do_compress_ram_page: compress (or zero-detect) one page into @f
 *
 * Returns true if the page was a zero page (sent as such), false
 * otherwise.  NOTE(review): presumably runs in a compression worker
 * thread given the per-thread @stream/@f/@source_buf — confirm with
 * the comp_param users.
 *
 * @f: QEMUFile buffer to write into
 * @stream: zlib stream to compress with
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @source_buf: scratch buffer the page is copied into before compressing
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}

/* Fold one finished compression request into the migration counters */
static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

/*
 * flush_compressed_data: wait for all compression workers to finish and
 * push their buffered output into the migration stream.
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* First wait until every worker has completed its in-flight request */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

/* Record which block/offset a compression worker should process next */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

/*
 * compress_page_with_multi_thread: hand one page to an idle compression
 * worker.
 *
 * Returns 1 if a worker accepted the page, -1 if none was free (only
 * possible when compress-wait-thread is off).
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Flush this worker's previous result before reusing it */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exists in compression threads's ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also If xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    /* Lock-free fast path: nothing queued, so don't take the mutex */
    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
            QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            /* Multi-page request: consume one page and keep the entry */
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            /* Last page of this request: drop the entry and its mr ref */
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even is
         * really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. in case that there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
1464 */ 1465 RCU_READ_LOCK_GUARD(); 1466 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1467 memory_region_unref(mspr->rb->mr); 1468 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1469 g_free(mspr); 1470 } 1471 } 1472 1473 /** 1474 * ram_save_queue_pages: queue the page for transmission 1475 * 1476 * A request from postcopy destination for example. 1477 * 1478 * Returns zero on success or negative on error 1479 * 1480 * @rbname: Name of the RAMBLock of the request. NULL means the 1481 * same that last one. 1482 * @start: starting address from the start of the RAMBlock 1483 * @len: length (in bytes) to send 1484 */ 1485 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1486 { 1487 RAMBlock *ramblock; 1488 RAMState *rs = ram_state; 1489 1490 ram_counters.postcopy_requests++; 1491 RCU_READ_LOCK_GUARD(); 1492 1493 if (!rbname) { 1494 /* Reuse last RAMBlock */ 1495 ramblock = rs->last_req_rb; 1496 1497 if (!ramblock) { 1498 /* 1499 * Shouldn't happen, we can't reuse the last RAMBlock if 1500 * it's the 1st request. 
1501 */ 1502 error_report("ram_save_queue_pages no previous block"); 1503 return -1; 1504 } 1505 } else { 1506 ramblock = qemu_ram_block_by_name(rbname); 1507 1508 if (!ramblock) { 1509 /* We shouldn't be asked for a non-existent RAMBlock */ 1510 error_report("ram_save_queue_pages no block '%s'", rbname); 1511 return -1; 1512 } 1513 rs->last_req_rb = ramblock; 1514 } 1515 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1516 if (start+len > ramblock->used_length) { 1517 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1518 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1519 __func__, start, len, ramblock->used_length); 1520 return -1; 1521 } 1522 1523 struct RAMSrcPageRequest *new_entry = 1524 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1525 new_entry->rb = ramblock; 1526 new_entry->offset = start; 1527 new_entry->len = len; 1528 1529 memory_region_ref(ramblock->mr); 1530 qemu_mutex_lock(&rs->src_page_req_mutex); 1531 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1532 migration_make_urgent_request(); 1533 qemu_mutex_unlock(&rs->src_page_req_mutex); 1534 1535 return 0; 1536 } 1537 1538 static bool save_page_use_compression(RAMState *rs) 1539 { 1540 if (!migrate_use_compression()) { 1541 return false; 1542 } 1543 1544 /* 1545 * If xbzrle is on, stop using the data compression after first 1546 * round of migration even if compression is enabled. In theory, 1547 * xbzrle can do better than compression. 
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    /* All workers busy (and compress-wait-thread off): caller sends it */
    compression_counters.busy++;
    return false;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    /* Try the paths in priority order: control, compression, zero, ... */
    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* Number of target pages per host page for this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    /* Resume the scan where the previous call left off */
    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        /* Urgent (postcopy) requests take priority over the linear scan */
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}

/*
 * acct_update_position: account @size bytes of pages saved elsewhere.
 * Zero pages bump the duplicate counter; others bump normal/transferred
 * and advance the file position.
 */
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

/* Sum of used_length over RAMBlocks, optionally counting ignored blocks */
static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    RCU_READ_LOCK_GUARD();

    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    return total;
}

uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void
ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}

/* Free all XBZRLE buffers and the cache; idempotent via the cache check */
static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* caller have hold iothread lock or is in a bh, so there is
     * no writing race against the migration bitmap
     */
    memory_global_dirty_log_stop();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}

/* Reset the per-migration scan state for a fresh pass over RAM */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
    rs->fpo_enabled = false;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * NOTE(review): 'todump' is dereferenced unconditionally below, so a
 * NULL argument is not handled here — confirm all callers pass a bitmap.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        /* Only print lines that contain at least one unexpected bit */
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

/*
 * ram_postcopy_migrated_memory_release: discard every run of clean
 * (already migrated) pages so the source gives that memory back to
 * the host.
 */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 *
 * @ms: current migration state
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *bitmap = block->bmap;

    /* Walk runs of set (dirty) bits and send one discard per run */
    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(bitmap, end, current);
        unsigned long zero, discard_length;

        if (one >= end) {
            break;
        }

        zero = find_next_zero_bit(bitmap, end, one + 1);

        if (zero >= end) {
            discard_length = end - one;
        } else {
            discard_length = zero - one;
        }
        postcopy_discard_send_range(ms, one, discard_length);
        current = one + discard_length;
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, block);
        postcopy_discard_send_finish(ms);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages.
 * NOTE(review): the old doc said "called twice ... two bitmaps", but only
 * one call site is visible below (postcopy_chunk_hostpages, operating on
 * block->bmap) — confirm before relying on that claim.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix. This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Target pages per host page for this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}

/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    postcopy_discard_send_finish(ms);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);

    return ret;
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request.
 *          NOTE(review): old doc said "NULL means the same that last one",
 *          but @rbname is passed straight to qemu_ram_block_by_name below
 *          with no NULL handling — confirm callers never pass NULL.
 * @start: byte offset within the RAMBlock
 * @length: length (in bytes) to discard
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}

/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

    /* Unwind in reverse order of allocation; labels fall through */
free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

/* Allocate and initialize the global RAMState; returns 0, or -1 on OOM */
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match with the initial values of dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}

/* Allocate per-RAMBlock dirty and clear bitmaps, all-ones initially */
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        /* Clamp the user-configured clear-bitmap shift to the valid range */
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when rebegin a
             * new migration after a failed migration, ram_list.
             * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
             * guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}

static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.
 */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* One-stop initialization of RAMState, xbzrle and the dirty bitmaps */
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}

/* Recompute RAMState counters from the bitmaps when resuming a
 * broken postcopy migration on a new channel @out */
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}

/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    /* The hinted range may straddle RAMBlocks; handle one block per pass.
     * NOTE: arithmetic on the void *addr relies on the GCC extension
     * (sizeof(void) == 1) that QEMU builds with. */
    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        /* Keep migration_dirty_pages consistent with the bitmap update */
        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Announce total RAM size, then one record per migratable block */
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Keep sending while under the rate limit, or while urgent
         * (postcopy) page requests are queued */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to sent */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker just written */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to sent */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}

/* Report estimated remaining data; resyncs the dirty bitmap when the
 * estimate has dropped below @max_size and we are still in precopy */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size =
rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}

/* Decode one XBZRLE-compressed page from @f into @host.
 * Returns 0 on success, -1 on a malformed stream or decode failure. */
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    /* Remembers the block of the previous call so CONTINUE records can
     * omit the id.  NOTE(review): static state assumes a single incoming
     * migration stream at a time. */
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}

/* Translate a block-relative offset to a host pointer, or NULL if the
 * offset lies outside the block */
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/* As above, but resolve into the block's COLO cache instead of guest RAM;
 * also marks the page dirty for the later cache flush */
static inline void *colo_cache_from_block_offset(RAMBlock *block,
                                                 ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need bitmap of these migrated pages.
     * It help us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     */
    if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the memset when the range is already all-zero and ch is 0 */
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    /* Reuse the stream across pages; reset per-page state only */
    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* A full page must decompress in one shot */
    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

/* Worker thread body: decompress pages handed over via DecompressParam.
 * Protocol: param->des != NULL means "work pending"; done/quit are
 * signalled under the respective locks. */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            /* Drop the lock while doing the (slow) inflate */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

/* Block until every decompress worker is idle; returns any file error */
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    /* First pass: ask every started thread to quit */
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as a indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    /* Second pass: join and tear down per-thread resources */
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

/* Spawn the decompress worker pool; returns 0, or -1 after cleaning up
 * any partially initialized threads */
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        /* compbuf != NULL doubles as the "thread i is init'd" marker */
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

/* Hand @len compressed bytes for @host to an idle worker, waiting for
 * one to become free if necessary */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM, it is need to hold the global lock
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL,
                                                    false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                /* Roll back every cache allocated so far */
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                /* NOTE(review): relies on errno still reflecting the alloc
                 * failure after error_report() — verify */
                return -errno;
            }
            memcpy(block->colo_cache, block->host, block->used_length);
        }
    }

    /*
     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
     * with to decide which page in cache should be flushed into SVM's RAM. Here
     * we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;

            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
        }
    }
    ram_state = g_new0(RAMState, 1);
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_init(&ram_state->bitmap_mutex);
    memory_global_dirty_log_start();

    return 0;
}

/* It is need to hold the global lock to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    qemu_mutex_destroy(&ram_state->bitmap_mutex);
    g_free(ram_state);
    ram_state = NULL;
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    /* Flush received pages to backing storage before tearing down */
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly names
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Host-page-aligned address the current host page is being built for */
    void *this_host = NULL;
    bool all_zero = false;
    /* Number of target pages accumulated into the current host page */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* Flags are packed in the low bits of the page address */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (target_pages == 1) {
                all_zero = true;
                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                    block->page_size);
            } else {
                /* not the 1st TP within the HP */
                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
                    (uintptr_t)this_host) {
                    error_report("Non-same host page %p/%p",
                                 host, this_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
                target_pages = 0;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                       block->page_size);

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}

/* True from the postcopy ADVISE message until postcopy fully ends */
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

/* True while the destination is actively in the postcopy phase */
static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that be dirtied by PVM or SVM or both.
3291 */ 3292 static void colo_flush_ram_cache(void) 3293 { 3294 RAMBlock *block = NULL; 3295 void *dst_host; 3296 void *src_host; 3297 unsigned long offset = 0; 3298 3299 memory_global_dirty_log_sync(); 3300 WITH_RCU_READ_LOCK_GUARD() { 3301 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3302 ramblock_sync_dirty_bitmap(ram_state, block); 3303 } 3304 } 3305 3306 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3307 WITH_RCU_READ_LOCK_GUARD() { 3308 block = QLIST_FIRST_RCU(&ram_list.blocks); 3309 3310 while (block) { 3311 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3312 3313 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3314 >= block->used_length) { 3315 offset = 0; 3316 block = QLIST_NEXT_RCU(block, next); 3317 } else { 3318 migration_bitmap_clear_dirty(ram_state, block, offset); 3319 dst_host = block->host 3320 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3321 src_host = block->colo_cache 3322 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3323 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3324 } 3325 } 3326 } 3327 trace_colo_flush_ram_cache_end(); 3328 } 3329 3330 /** 3331 * ram_load_precopy: load pages in precopy case 3332 * 3333 * Returns 0 for success or -errno in case of error 3334 * 3335 * Called in precopy mode by ram_load(). 3336 * rcu_read_lock is taken prior to this being called. 
3337 * 3338 * @f: QEMUFile where to send the data 3339 */ 3340 static int ram_load_precopy(QEMUFile *f) 3341 { 3342 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3343 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3344 bool postcopy_advised = postcopy_is_advised(); 3345 if (!migrate_use_compression()) { 3346 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3347 } 3348 3349 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3350 ram_addr_t addr, total_ram_bytes; 3351 void *host = NULL; 3352 uint8_t ch; 3353 3354 /* 3355 * Yield periodically to let main loop run, but an iteration of 3356 * the main loop is expensive, so do it each some iterations 3357 */ 3358 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3359 aio_co_schedule(qemu_get_current_aio_context(), 3360 qemu_coroutine_self()); 3361 qemu_coroutine_yield(); 3362 } 3363 i++; 3364 3365 addr = qemu_get_be64(f); 3366 flags = addr & ~TARGET_PAGE_MASK; 3367 addr &= TARGET_PAGE_MASK; 3368 3369 if (flags & invalid_flags) { 3370 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3371 error_report("Received an unexpected compressed page"); 3372 } 3373 3374 ret = -EINVAL; 3375 break; 3376 } 3377 3378 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3379 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3380 RAMBlock *block = ram_block_from_stream(f, flags); 3381 3382 /* 3383 * After going into COLO, we should load the Page into colo_cache. 
3384 */ 3385 if (migration_incoming_in_colo_state()) { 3386 host = colo_cache_from_block_offset(block, addr); 3387 } else { 3388 host = host_from_ram_block_offset(block, addr); 3389 } 3390 if (!host) { 3391 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3392 ret = -EINVAL; 3393 break; 3394 } 3395 3396 if (!migration_incoming_in_colo_state()) { 3397 ramblock_recv_bitmap_set(block, host); 3398 } 3399 3400 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3401 } 3402 3403 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3404 case RAM_SAVE_FLAG_MEM_SIZE: 3405 /* Synchronize RAM block list */ 3406 total_ram_bytes = addr; 3407 while (!ret && total_ram_bytes) { 3408 RAMBlock *block; 3409 char id[256]; 3410 ram_addr_t length; 3411 3412 len = qemu_get_byte(f); 3413 qemu_get_buffer(f, (uint8_t *)id, len); 3414 id[len] = 0; 3415 length = qemu_get_be64(f); 3416 3417 block = qemu_ram_block_by_name(id); 3418 if (block && !qemu_ram_is_migratable(block)) { 3419 error_report("block %s should not be migrated !", id); 3420 ret = -EINVAL; 3421 } else if (block) { 3422 if (length != block->used_length) { 3423 Error *local_err = NULL; 3424 3425 ret = qemu_ram_resize(block, length, 3426 &local_err); 3427 if (local_err) { 3428 error_report_err(local_err); 3429 } 3430 } 3431 /* For postcopy we need to check hugepage sizes match */ 3432 if (postcopy_advised && 3433 block->page_size != qemu_host_page_size) { 3434 uint64_t remote_page_size = qemu_get_be64(f); 3435 if (remote_page_size != block->page_size) { 3436 error_report("Mismatched RAM page size %s " 3437 "(local) %zd != %" PRId64, 3438 id, block->page_size, 3439 remote_page_size); 3440 ret = -EINVAL; 3441 } 3442 } 3443 if (migrate_ignore_shared()) { 3444 hwaddr addr = qemu_get_be64(f); 3445 if (ramblock_is_ignored(block) && 3446 block->mr->addr != addr) { 3447 error_report("Mismatched GPAs for block %s " 3448 "%" PRId64 "!= %" PRId64, 3449 id, (uint64_t)addr, 3450 (uint64_t)block->mr->addr); 3451 ret = -EINVAL; 3452 } 
3453 } 3454 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3455 block->idstr); 3456 } else { 3457 error_report("Unknown ramblock \"%s\", cannot " 3458 "accept migration", id); 3459 ret = -EINVAL; 3460 } 3461 3462 total_ram_bytes -= length; 3463 } 3464 break; 3465 3466 case RAM_SAVE_FLAG_ZERO: 3467 ch = qemu_get_byte(f); 3468 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3469 break; 3470 3471 case RAM_SAVE_FLAG_PAGE: 3472 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3473 break; 3474 3475 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3476 len = qemu_get_be32(f); 3477 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3478 error_report("Invalid compressed data length: %d", len); 3479 ret = -EINVAL; 3480 break; 3481 } 3482 decompress_data_with_multi_threads(f, host, len); 3483 break; 3484 3485 case RAM_SAVE_FLAG_XBZRLE: 3486 if (load_xbzrle(f, addr, host) < 0) { 3487 error_report("Failed to decompress XBZRLE page at " 3488 RAM_ADDR_FMT, addr); 3489 ret = -EINVAL; 3490 break; 3491 } 3492 break; 3493 case RAM_SAVE_FLAG_EOS: 3494 /* normal exit */ 3495 multifd_recv_sync_main(); 3496 break; 3497 default: 3498 if (flags & RAM_SAVE_FLAG_HOOK) { 3499 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3500 } else { 3501 error_report("Unknown combination of migration flags: %#x", 3502 flags); 3503 ret = -EINVAL; 3504 } 3505 } 3506 if (!ret) { 3507 ret = qemu_file_get_error(f); 3508 } 3509 } 3510 3511 ret |= wait_for_decompress_done(); 3512 return ret; 3513 } 3514 3515 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3516 { 3517 int ret = 0; 3518 static uint64_t seq_iter; 3519 /* 3520 * If system is running in postcopy mode, page inserts to host memory must 3521 * be atomic 3522 */ 3523 bool postcopy_running = postcopy_is_running(); 3524 3525 seq_iter++; 3526 3527 if (version_id != 4) { 3528 return -EINVAL; 3529 } 3530 3531 /* 3532 * This RCU critical section can be very long running. 
3533 * When RCU reclaims in the code start to become numerous, 3534 * it will be necessary to reduce the granularity of this 3535 * critical section. 3536 */ 3537 WITH_RCU_READ_LOCK_GUARD() { 3538 if (postcopy_running) { 3539 ret = ram_load_postcopy(f); 3540 } else { 3541 ret = ram_load_precopy(f); 3542 } 3543 } 3544 trace_ram_load_complete(ret, seq_iter); 3545 3546 if (!ret && migration_incoming_in_colo_state()) { 3547 colo_flush_ram_cache(); 3548 } 3549 return ret; 3550 } 3551 3552 static bool ram_has_postcopy(void *opaque) 3553 { 3554 RAMBlock *rb; 3555 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3556 if (ramblock_is_pmem(rb)) { 3557 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 3558 "is not supported now!", rb->idstr, rb->host); 3559 return false; 3560 } 3561 } 3562 3563 return migrate_postcopy_ram(); 3564 } 3565 3566 /* Sync all the dirty bitmap with destination VM. */ 3567 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3568 { 3569 RAMBlock *block; 3570 QEMUFile *file = s->to_dst_file; 3571 int ramblock_count = 0; 3572 3573 trace_ram_dirty_bitmap_sync_start(); 3574 3575 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3576 qemu_savevm_send_recv_bitmap(file, block->idstr); 3577 trace_ram_dirty_bitmap_request(block->idstr); 3578 ramblock_count++; 3579 } 3580 3581 trace_ram_dirty_bitmap_sync_wait(); 3582 3583 /* Wait until all the ramblocks' dirty bitmap synced */ 3584 while (ramblock_count--) { 3585 qemu_sem_wait(&s->rp_state.rp_sem); 3586 } 3587 3588 trace_ram_dirty_bitmap_sync_complete(); 3589 3590 return 0; 3591 } 3592 3593 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3594 { 3595 qemu_sem_post(&s->rp_state.rp_sem); 3596 } 3597 3598 /* 3599 * Read the received bitmap, revert it as the initial dirty bitmap. 3600 * This is only used when the postcopy migration is paused but wants 3601 * to resume from a middle point. 
3602 */ 3603 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3604 { 3605 int ret = -EINVAL; 3606 QEMUFile *file = s->rp_state.from_dst_file; 3607 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3608 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 3609 uint64_t size, end_mark; 3610 3611 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3612 3613 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3614 error_report("%s: incorrect state %s", __func__, 3615 MigrationStatus_str(s->state)); 3616 return -EINVAL; 3617 } 3618 3619 /* 3620 * Note: see comments in ramblock_recv_bitmap_send() on why we 3621 * need the endianess convertion, and the paddings. 3622 */ 3623 local_size = ROUND_UP(local_size, 8); 3624 3625 /* Add paddings */ 3626 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 3627 3628 size = qemu_get_be64(file); 3629 3630 /* The size of the bitmap should match with our ramblock */ 3631 if (size != local_size) { 3632 error_report("%s: ramblock '%s' bitmap size mismatch " 3633 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 3634 block->idstr, size, local_size); 3635 ret = -EINVAL; 3636 goto out; 3637 } 3638 3639 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 3640 end_mark = qemu_get_be64(file); 3641 3642 ret = qemu_file_get_error(file); 3643 if (ret || size != local_size) { 3644 error_report("%s: read bitmap failed for ramblock '%s': %d" 3645 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 3646 __func__, block->idstr, ret, local_size, size); 3647 ret = -EIO; 3648 goto out; 3649 } 3650 3651 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 3652 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64, 3653 __func__, block->idstr, end_mark); 3654 ret = -EINVAL; 3655 goto out; 3656 } 3657 3658 /* 3659 * Endianess convertion. We are during postcopy (though paused). 3660 * The dirty bitmap won't change. We can directly modify it. 
3661 */ 3662 bitmap_from_le(block->bmap, le_bitmap, nbits); 3663 3664 /* 3665 * What we received is "received bitmap". Revert it as the initial 3666 * dirty bitmap for this ramblock. 3667 */ 3668 bitmap_complement(block->bmap, block->bmap, nbits); 3669 3670 trace_ram_dirty_bitmap_reload_complete(block->idstr); 3671 3672 /* 3673 * We succeeded to sync bitmap for current ramblock. If this is 3674 * the last one to sync, we need to notify the main send thread. 3675 */ 3676 ram_dirty_bitmap_reload_notify(s); 3677 3678 ret = 0; 3679 out: 3680 g_free(le_bitmap); 3681 return ret; 3682 } 3683 3684 static int ram_resume_prepare(MigrationState *s, void *opaque) 3685 { 3686 RAMState *rs = *(RAMState **)opaque; 3687 int ret; 3688 3689 ret = ram_dirty_bitmap_sync_all(s, rs); 3690 if (ret) { 3691 return ret; 3692 } 3693 3694 ram_state_resume_prepare(rs, s->to_dst_file); 3695 3696 return 0; 3697 } 3698 3699 static SaveVMHandlers savevm_ram_handlers = { 3700 .save_setup = ram_save_setup, 3701 .save_live_iterate = ram_save_iterate, 3702 .save_live_complete_postcopy = ram_save_complete, 3703 .save_live_complete_precopy = ram_save_complete, 3704 .has_postcopy = ram_has_postcopy, 3705 .save_live_pending = ram_save_pending, 3706 .load_state = ram_load, 3707 .save_cleanup = ram_save_cleanup, 3708 .load_setup = ram_load_setup, 3709 .load_cleanup = ram_load_cleanup, 3710 .resume_prepare = ram_resume_prepare, 3711 }; 3712 3713 void ram_mig_init(void) 3714 { 3715 qemu_mutex_init(&XBZRLE.lock); 3716 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 3717 } 3718