/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

XBZRLECacheStats xbzrle_counters;

/* struct containing the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock.
*/ 94 PageCache *cache; 95 QemuMutex lock; 96 /* it will store a page full of zeros */ 97 uint8_t *zero_target_page; 98 /* buffer used for XBZRLE decoding */ 99 uint8_t *decoded_buf; 100 } XBZRLE; 101 102 static void XBZRLE_cache_lock(void) 103 { 104 if (migrate_use_xbzrle()) { 105 qemu_mutex_lock(&XBZRLE.lock); 106 } 107 } 108 109 static void XBZRLE_cache_unlock(void) 110 { 111 if (migrate_use_xbzrle()) { 112 qemu_mutex_unlock(&XBZRLE.lock); 113 } 114 } 115 116 /** 117 * xbzrle_cache_resize: resize the xbzrle cache 118 * 119 * This function is called from migrate_params_apply in main 120 * thread, possibly while a migration is in progress. A running 121 * migration may be using the cache and might finish during this call, 122 * hence changes to the cache are protected by XBZRLE.lock(). 123 * 124 * Returns 0 for success or -1 for error 125 * 126 * @new_size: new cache size 127 * @errp: set *errp if the check failed, with reason 128 */ 129 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 130 { 131 PageCache *new_cache; 132 int64_t ret = 0; 133 134 /* Check for truncation */ 135 if (new_size != (size_t)new_size) { 136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 137 "exceeding address space"); 138 return -1; 139 } 140 141 if (new_size == migrate_xbzrle_cache_size()) { 142 /* nothing to do */ 143 return 0; 144 } 145 146 XBZRLE_cache_lock(); 147 148 if (XBZRLE.cache != NULL) { 149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 150 if (!new_cache) { 151 ret = -1; 152 goto out; 153 } 154 155 cache_fini(XBZRLE.cache); 156 XBZRLE.cache = new_cache; 157 } 158 out: 159 XBZRLE_cache_unlock(); 160 return ret; 161 } 162 163 bool ramblock_is_ignored(RAMBlock *block) 164 { 165 return !qemu_ram_is_migratable(block) || 166 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 167 } 168 169 #undef RAMBLOCK_FOREACH 170 171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 172 { 173 RAMBlock *block; 174 int ret = 0; 175 176 RCU_READ_LOCK_GUARD(); 177 178 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 179 ret = func(block, opaque); 180 if (ret) { 181 break; 182 } 183 } 184 return ret; 185 } 186 187 static void ramblock_recv_map_init(void) 188 { 189 RAMBlock *rb; 190 191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 192 assert(!rb->receivedmap); 193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 194 } 195 } 196 197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 198 { 199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 200 rb->receivedmap); 201 } 202 203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 204 { 205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 206 } 207 208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 209 { 210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 211 } 212 213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 214 size_t nr) 215 { 216 bitmap_set_atomic(rb->receivedmap, 217 ramblock_recv_bitmap_offset(host_addr, rb), 218 nr); 219 } 220 221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 222 223 /* 224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 225 * 226 * Returns >0 if success with sent bytes, or <0 if error. 
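 *
 * For reference, a peer consuming this stream mirrors the format roughly
 * as below (an illustrative sketch, not code from this file; "nbits" and
 * "local_bitmap" are placeholders, error handling is elided):
 *
 *   uint64_t size = qemu_get_be64(file);
 *   unsigned long *le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 *   qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *   if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *       error_report("recv bitmap stream corrupted");
 *   }
 *   bitmap_from_le(local_bitmap, le_bitmap, nbits);
 *   g_free(le_bitmap);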
227 */ 228 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 229 const char *block_name) 230 { 231 RAMBlock *block = qemu_ram_block_by_name(block_name); 232 unsigned long *le_bitmap, nbits; 233 uint64_t size; 234 235 if (!block) { 236 error_report("%s: invalid block name: %s", __func__, block_name); 237 return -1; 238 } 239 240 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 241 242 /* 243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 244 * machines we may need 4 more bytes for padding (see below 245 * comment). So extend it a bit before hand. 246 */ 247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 248 249 /* 250 * Always use little endian when sending the bitmap. This is 251 * required that when source and destination VMs are not using the 252 * same endianness. (Note: big endian won't work.) 253 */ 254 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 255 256 /* Size of the bitmap, in bytes */ 257 size = DIV_ROUND_UP(nbits, 8); 258 259 /* 260 * size is always aligned to 8 bytes for 64bit machines, but it 261 * may not be true for 32bit machines. We need this padding to 262 * make sure the migration can survive even between 32bit and 263 * 64bit machines. 264 */ 265 size = ROUND_UP(size, 8); 266 267 qemu_put_be64(file, size); 268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 269 /* 270 * Mark as an end, in case the middle part is screwed up due to 271 * some "mysterious" reason. 272 */ 273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 274 qemu_fflush(file); 275 276 g_free(le_bitmap); 277 278 if (qemu_file_get_error(file)) { 279 return qemu_file_get_error(file); 280 } 281 282 return size + sizeof(size); 283 } 284 285 /* 286 * An outstanding page request, on the source, having been received 287 * and queued 288 */ 289 struct RAMSrcPageRequest { 290 RAMBlock *rb; 291 hwaddr offset; 292 hwaddr len; 293 294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 295 }; 296 297 /* State of RAM for migration */ 298 struct RAMState { 299 /* QEMUFile used for this migration */ 300 QEMUFile *f; 301 /* UFFD file descriptor, used in 'write-tracking' migration */ 302 int uffdio_fd; 303 /* Last block that we have visited searching for dirty pages */ 304 RAMBlock *last_seen_block; 305 /* Last block from where we have sent data */ 306 RAMBlock *last_sent_block; 307 /* Last dirty target page we have sent */ 308 ram_addr_t last_page; 309 /* last ram version we have seen */ 310 uint32_t last_version; 311 /* How many times we have dirty too many pages */ 312 int dirty_rate_high_cnt; 313 /* these variables are used for bitmap sync */ 314 /* last time we did a full bitmap_sync */ 315 int64_t time_last_bitmap_sync; 316 /* bytes transferred at start_time */ 317 uint64_t bytes_xfer_prev; 318 /* number of dirty pages since start_time */ 319 uint64_t num_dirty_pages_period; 320 /* xbzrle misses since the beginning of the period */ 321 uint64_t xbzrle_cache_miss_prev; 322 /* Amount of xbzrle pages since the beginning of the period */ 323 uint64_t xbzrle_pages_prev; 324 /* Amount of xbzrle encoded bytes since the beginning of the period */ 325 uint64_t xbzrle_bytes_prev; 326 /* Start using XBZRLE (e.g., after the first round). 
*/ 327 bool xbzrle_enabled; 328 329 /* compression statistics since the beginning of the period */ 330 /* amount of count that no free thread to compress data */ 331 uint64_t compress_thread_busy_prev; 332 /* amount bytes after compression */ 333 uint64_t compressed_size_prev; 334 /* amount of compressed pages */ 335 uint64_t compress_pages_prev; 336 337 /* total handled target pages at the beginning of period */ 338 uint64_t target_page_count_prev; 339 /* total handled target pages since start */ 340 uint64_t target_page_count; 341 /* number of dirty bits in the bitmap */ 342 uint64_t migration_dirty_pages; 343 /* Protects modification of the bitmap and migration dirty pages */ 344 QemuMutex bitmap_mutex; 345 /* The RAMBlock used in the last src_page_requests */ 346 RAMBlock *last_req_rb; 347 /* Queue of outstanding page requests from the destination */ 348 QemuMutex src_page_req_mutex; 349 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 350 }; 351 typedef struct RAMState RAMState; 352 353 static RAMState *ram_state; 354 355 static NotifierWithReturnList precopy_notifier_list; 356 357 void precopy_infrastructure_init(void) 358 { 359 notifier_with_return_list_init(&precopy_notifier_list); 360 } 361 362 void precopy_add_notifier(NotifierWithReturn *n) 363 { 364 notifier_with_return_list_add(&precopy_notifier_list, n); 365 } 366 367 void precopy_remove_notifier(NotifierWithReturn *n) 368 { 369 notifier_with_return_remove(n); 370 } 371 372 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 373 { 374 PrecopyNotifyData pnd; 375 pnd.reason = reason; 376 pnd.errp = errp; 377 378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 379 } 380 381 uint64_t ram_bytes_remaining(void) 382 { 383 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 384 0; 385 } 386 387 MigrationStats ram_counters; 388 389 /* used by the search for pages to send */ 390 struct PageSearchStatus { 391 /* Current block being searched */ 392 RAMBlock *block; 393 /* Current page to search from */ 394 unsigned long page; 395 /* Set once we wrap around */ 396 bool complete_round; 397 }; 398 typedef struct PageSearchStatus PageSearchStatus; 399 400 CompressionStats compression_counters; 401 402 struct CompressParam { 403 bool done; 404 bool quit; 405 bool zero_page; 406 QEMUFile *file; 407 QemuMutex mutex; 408 QemuCond cond; 409 RAMBlock *block; 410 ram_addr_t offset; 411 412 /* internally used fields */ 413 z_stream stream; 414 uint8_t *originbuf; 415 }; 416 typedef struct CompressParam CompressParam; 417 418 struct DecompressParam { 419 bool done; 420 bool quit; 421 QemuMutex mutex; 422 QemuCond cond; 423 void *des; 424 uint8_t *compbuf; 425 int len; 426 z_stream stream; 427 }; 428 typedef struct DecompressParam DecompressParam; 429 430 static CompressParam *comp_param; 431 static QemuThread *compress_threads; 432 /* comp_done_cond is used to wake up the migration thread when 433 * one of the compression threads has finished the compression. 434 * comp_done_lock is used to co-work with comp_done_cond. 
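 *
 * In outline, the migration thread waits for workers with the pattern
 * that flush_compressed_data() uses further down (an illustrative
 * sketch, with the per-thread param->mutex handling elided):
 *
 *   qemu_mutex_lock(&comp_done_lock);
 *   for (idx = 0; idx < thread_count; idx++) {
 *       while (!comp_param[idx].done) {
 *           qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 *       }
 *   }
 *   qemu_mutex_unlock(&comp_done_lock);
 *
 * while each worker flips its own "done" under comp_done_lock and signals
 * comp_done_cond, as do_data_compress() below shows.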
435 */ 436 static QemuMutex comp_done_lock; 437 static QemuCond comp_done_cond; 438 /* The empty QEMUFileOps will be used by file in CompressParam */ 439 static const QEMUFileOps empty_ops = { }; 440 441 static QEMUFile *decomp_file; 442 static DecompressParam *decomp_param; 443 static QemuThread *decompress_threads; 444 static QemuMutex decomp_done_lock; 445 static QemuCond decomp_done_cond; 446 447 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 448 ram_addr_t offset, uint8_t *source_buf); 449 450 static void *do_data_compress(void *opaque) 451 { 452 CompressParam *param = opaque; 453 RAMBlock *block; 454 ram_addr_t offset; 455 bool zero_page; 456 457 qemu_mutex_lock(¶m->mutex); 458 while (!param->quit) { 459 if (param->block) { 460 block = param->block; 461 offset = param->offset; 462 param->block = NULL; 463 qemu_mutex_unlock(¶m->mutex); 464 465 zero_page = do_compress_ram_page(param->file, ¶m->stream, 466 block, offset, param->originbuf); 467 468 qemu_mutex_lock(&comp_done_lock); 469 param->done = true; 470 param->zero_page = zero_page; 471 qemu_cond_signal(&comp_done_cond); 472 qemu_mutex_unlock(&comp_done_lock); 473 474 qemu_mutex_lock(¶m->mutex); 475 } else { 476 qemu_cond_wait(¶m->cond, ¶m->mutex); 477 } 478 } 479 qemu_mutex_unlock(¶m->mutex); 480 481 return NULL; 482 } 483 484 static void compress_threads_save_cleanup(void) 485 { 486 int i, thread_count; 487 488 if (!migrate_use_compression() || !comp_param) { 489 return; 490 } 491 492 thread_count = migrate_compress_threads(); 493 for (i = 0; i < thread_count; i++) { 494 /* 495 * we use it as a indicator which shows if the thread is 496 * properly init'd or not 497 */ 498 if (!comp_param[i].file) { 499 break; 500 } 501 502 qemu_mutex_lock(&comp_param[i].mutex); 503 comp_param[i].quit = true; 504 qemu_cond_signal(&comp_param[i].cond); 505 qemu_mutex_unlock(&comp_param[i].mutex); 506 507 qemu_thread_join(compress_threads + i); 508 qemu_mutex_destroy(&comp_param[i].mutex); 509 qemu_cond_destroy(&comp_param[i].cond); 510 deflateEnd(&comp_param[i].stream); 511 g_free(comp_param[i].originbuf); 512 qemu_fclose(comp_param[i].file); 513 comp_param[i].file = NULL; 514 } 515 qemu_mutex_destroy(&comp_done_lock); 516 qemu_cond_destroy(&comp_done_cond); 517 g_free(compress_threads); 518 g_free(comp_param); 519 compress_threads = NULL; 520 comp_param = NULL; 521 } 522 523 static int compress_threads_save_setup(void) 524 { 525 int i, thread_count; 526 527 if (!migrate_use_compression()) { 528 return 0; 529 } 530 thread_count = migrate_compress_threads(); 531 compress_threads = g_new0(QemuThread, thread_count); 532 comp_param = g_new0(CompressParam, thread_count); 533 qemu_cond_init(&comp_done_cond); 534 qemu_mutex_init(&comp_done_lock); 535 for (i = 0; i < thread_count; i++) { 536 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 537 if (!comp_param[i].originbuf) { 538 goto exit; 539 } 540 541 if (deflateInit(&comp_param[i].stream, 542 migrate_compress_level()) != Z_OK) { 543 g_free(comp_param[i].originbuf); 544 goto exit; 545 } 546 547 /* comp_param[i].file is just used as a dummy buffer to save data, 548 * set its ops to empty. 
549 */ 550 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false); 551 comp_param[i].done = true; 552 comp_param[i].quit = false; 553 qemu_mutex_init(&comp_param[i].mutex); 554 qemu_cond_init(&comp_param[i].cond); 555 qemu_thread_create(compress_threads + i, "compress", 556 do_data_compress, comp_param + i, 557 QEMU_THREAD_JOINABLE); 558 } 559 return 0; 560 561 exit: 562 compress_threads_save_cleanup(); 563 return -1; 564 } 565 566 /** 567 * save_page_header: write page header to wire 568 * 569 * If this is the 1st block, it also writes the block identification 570 * 571 * Returns the number of bytes written 572 * 573 * @f: QEMUFile where to send the data 574 * @block: block that contains the page we want to send 575 * @offset: offset inside the block for the page 576 * in the lower bits, it contains flags 577 */ 578 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 579 ram_addr_t offset) 580 { 581 size_t size, len; 582 583 if (block == rs->last_sent_block) { 584 offset |= RAM_SAVE_FLAG_CONTINUE; 585 } 586 qemu_put_be64(f, offset); 587 size = 8; 588 589 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 590 len = strlen(block->idstr); 591 qemu_put_byte(f, len); 592 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 593 size += 1 + len; 594 rs->last_sent_block = block; 595 } 596 return size; 597 } 598 599 /** 600 * mig_throttle_guest_down: throttle down the guest 601 * 602 * Reduce amount of guest cpu execution to hopefully slow down memory 603 * writes. If guest dirty memory rate is reduced below the rate at 604 * which we can transfer pages to the destination then we should be 605 * able to complete migration. Some workloads dirty memory way too 606 * fast and will not effectively converge, even with auto-converge. 607 */ 608 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 609 uint64_t bytes_dirty_threshold) 610 { 611 MigrationState *s = migrate_get_current(); 612 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 613 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 614 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 615 int pct_max = s->parameters.max_cpu_throttle; 616 617 uint64_t throttle_now = cpu_throttle_get_percentage(); 618 uint64_t cpu_now, cpu_ideal, throttle_inc; 619 620 /* We have not started throttling yet. Let's start it. */ 621 if (!cpu_throttle_active()) { 622 cpu_throttle_set(pct_initial); 623 } else { 624 /* Throttling already on, just increase the rate */ 625 if (!pct_tailslow) { 626 throttle_inc = pct_increment; 627 } else { 628 /* Compute the ideal CPU percentage used by Guest, which may 629 * make the dirty rate match the dirty rate threshold. */ 630 cpu_now = 100 - throttle_now; 631 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 632 bytes_dirty_period); 633 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 634 } 635 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 636 } 637 } 638 639 void mig_throttle_counter_reset(void) 640 { 641 RAMState *rs = ram_state; 642 643 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 644 rs->num_dirty_pages_period = 0; 645 rs->bytes_xfer_prev = ram_counters.transferred; 646 } 647 648 /** 649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 650 * 651 * @rs: current RAM state 652 * @current_addr: address for the zero page 653 * 654 * Update the xbzrle cache to reflect a page that's been sent as all 0. 655 * The important thing is that a stale (not-yet-0'd) page be replaced 656 * by the new data. 
657 * As a bonus, if the page wasn't in the cache it gets added so that 658 * when a small write is made into the 0'd page it gets XBZRLE sent. 659 */ 660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 661 { 662 if (!rs->xbzrle_enabled) { 663 return; 664 } 665 666 /* We don't care if this fails to allocate a new cache page 667 * as long as it updated an old one */ 668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 669 ram_counters.dirty_sync_count); 670 } 671 672 #define ENCODING_FLAG_XBZRLE 0x1 673 674 /** 675 * save_xbzrle_page: compress and send current page 676 * 677 * Returns: 1 means that we wrote the page 678 * 0 means that page is identical to the one already sent 679 * -1 means that xbzrle would be longer than normal 680 * 681 * @rs: current RAM state 682 * @current_data: pointer to the address of the page contents 683 * @current_addr: addr of the page 684 * @block: block that contains the page we want to send 685 * @offset: offset inside the block for the page 686 * @last_stage: if we are at the completion stage 687 */ 688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 689 ram_addr_t current_addr, RAMBlock *block, 690 ram_addr_t offset, bool last_stage) 691 { 692 int encoded_len = 0, bytes_xbzrle; 693 uint8_t *prev_cached_page; 694 695 if (!cache_is_cached(XBZRLE.cache, current_addr, 696 ram_counters.dirty_sync_count)) { 697 xbzrle_counters.cache_miss++; 698 if (!last_stage) { 699 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 700 ram_counters.dirty_sync_count) == -1) { 701 return -1; 702 } else { 703 /* update *current_data when the page has been 704 inserted into cache */ 705 *current_data = get_cached_data(XBZRLE.cache, current_addr); 706 } 707 } 708 return -1; 709 } 710 711 /* 712 * Reaching here means the page has hit the xbzrle cache, no matter what 713 * encoding result it is (normal encoding, overflow or skipping the page), 714 * count the page as encoded. This is used to calculate the encoding rate. 715 * 716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 717 * 2nd page turns out to be skipped (i.e. no new bytes written to the 718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 719 * skipped page included. In this way, the encoding rate can tell if the 720 * guest page is good for xbzrle encoding. 721 */ 722 xbzrle_counters.pages++; 723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 724 725 /* save current buffer into memory */ 726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 727 728 /* XBZRLE encoding (if there is no overflow) */ 729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 731 TARGET_PAGE_SIZE); 732 733 /* 734 * Update the cache contents, so that it corresponds to the data 735 * sent, in all cases except where we skip the page. 736 */ 737 if (!last_stage && encoded_len != 0) { 738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 739 /* 740 * In the case where we couldn't compress, ensure that the caller 741 * sends the data from the cache, since the guest might have 742 * changed the RAM since we copied it. 
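 *
 * For reference, the three possible xbzrle_encode_buffer() results drive
 * what follows: 0 means the page is identical to the cached copy (nothing
 * is sent), -1 means the encoded delta would not fit in a page (fall back
 * to sending a normal page), and a positive length is sent as
 * ENCODING_FLAG_XBZRLE + be16 length + encoded bytes.  As a rough worked
 * example (hypothetical page): if only 16 bytes in the middle of a 4 KiB
 * page changed, the delta encodes to roughly 20 bytes (run-length headers
 * plus the 16 literal bytes) instead of 4096.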
743 */ 744 *current_data = prev_cached_page; 745 } 746 747 if (encoded_len == 0) { 748 trace_save_xbzrle_page_skipping(); 749 return 0; 750 } else if (encoded_len == -1) { 751 trace_save_xbzrle_page_overflow(); 752 xbzrle_counters.overflow++; 753 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 754 return -1; 755 } 756 757 /* Send XBZRLE based compressed page */ 758 bytes_xbzrle = save_page_header(rs, rs->f, block, 759 offset | RAM_SAVE_FLAG_XBZRLE); 760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 761 qemu_put_be16(rs->f, encoded_len); 762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 763 bytes_xbzrle += encoded_len + 1 + 2; 764 /* 765 * Like compressed_size (please see update_compress_thread_counts), 766 * the xbzrle encoded bytes don't count the 8 byte header with 767 * RAM_SAVE_FLAG_CONTINUE. 768 */ 769 xbzrle_counters.bytes += bytes_xbzrle - 8; 770 ram_counters.transferred += bytes_xbzrle; 771 772 return 1; 773 } 774 775 /** 776 * migration_bitmap_find_dirty: find the next dirty page from start 777 * 778 * Returns the page offset within memory region of the start of a dirty page 779 * 780 * @rs: current RAM state 781 * @rb: RAMBlock where to search for dirty pages 782 * @start: page where we start the search 783 */ 784 static inline 785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 786 unsigned long start) 787 { 788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 789 unsigned long *bitmap = rb->bmap; 790 791 if (ramblock_is_ignored(rb)) { 792 return size; 793 } 794 795 return find_next_bit(bitmap, size, start); 796 } 797 798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 799 unsigned long page) 800 { 801 uint8_t shift; 802 hwaddr size, start; 803 804 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 805 return; 806 } 807 808 shift = rb->clear_bmap_shift; 809 /* 810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 811 * can make things easier sometimes since then start address 812 * of the small chunk will always be 64 pages aligned so the 813 * bitmap will always be aligned to unsigned long. We should 814 * even be able to remove this restriction but I'm simply 815 * keeping it. 816 */ 817 assert(shift >= 6); 818 819 size = 1ULL << (TARGET_PAGE_BITS + shift); 820 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 821 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 822 memory_region_clear_dirty_bitmap(rb->mr, start, size); 823 } 824 825 static void 826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 827 unsigned long start, 828 unsigned long npages) 829 { 830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 833 834 /* 835 * Clear pages from start to start + npages - 1, so the end boundary is 836 * exclusive. 
 * exclusive.
 */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
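 *
 * Worked example (hypothetical layout): for a virtio-mem style RAMBlock
 * with 4 GiB of used_length of which only 1 GiB is currently plugged, the
 * replay below walks the ~3 GiB of discarded ranges, so up to
 * 3 GiB / TARGET_PAGE_SIZE bits are dropped from the dirty bitmap (fewer
 * if some of those pages were already clean) and those pages are never
 * queued for sending.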
936 */ 937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 938 { 939 uint64_t cleared_bits = 0; 940 941 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 942 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 943 MemoryRegionSection section = { 944 .mr = rb->mr, 945 .offset_within_region = 0, 946 .size = int128_make64(qemu_ram_get_used_length(rb)), 947 }; 948 949 ram_discard_manager_replay_discarded(rdm, §ion, 950 dirty_bitmap_clear_section, 951 &cleared_bits); 952 } 953 return cleared_bits; 954 } 955 956 /* 957 * Check if a host-page aligned page falls into a discarded range as managed by 958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 959 * 960 * Note: The result is only stable while migrating (precopy/postcopy). 961 */ 962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 963 { 964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 966 MemoryRegionSection section = { 967 .mr = rb->mr, 968 .offset_within_region = start, 969 .size = int128_make64(qemu_ram_pagesize(rb)), 970 }; 971 972 return !ram_discard_manager_is_populated(rdm, §ion); 973 } 974 return false; 975 } 976 977 /* Called with RCU critical section */ 978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 979 { 980 uint64_t new_dirty_pages = 981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 982 983 rs->migration_dirty_pages += new_dirty_pages; 984 rs->num_dirty_pages_period += new_dirty_pages; 985 } 986 987 /** 988 * ram_pagesize_summary: calculate all the pagesizes of a VM 989 * 990 * Returns a summary bitmap of the page sizes of all RAMBlocks 991 * 992 * For VMs with just normal pages this is equivalent to the host page 993 * size. If it's got some huge pages then it's the OR of all the 994 * different page sizes. 
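 *
 * Worked example (hypothetical mix): a guest with normal 4 KiB RAM plus
 * one RAMBlock backed by 2 MiB huge pages yields
 *
 *   summary = 0x1000 | 0x200000 = 0x201000
 *
 * so a caller can tell from a single value whether any block uses a page
 * size other than the base page size.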
995 */ 996 uint64_t ram_pagesize_summary(void) 997 { 998 RAMBlock *block; 999 uint64_t summary = 0; 1000 1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1002 summary |= block->page_size; 1003 } 1004 1005 return summary; 1006 } 1007 1008 uint64_t ram_get_total_transferred_pages(void) 1009 { 1010 return ram_counters.normal + ram_counters.duplicate + 1011 compression_counters.pages + xbzrle_counters.pages; 1012 } 1013 1014 static void migration_update_rates(RAMState *rs, int64_t end_time) 1015 { 1016 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1017 double compressed_size; 1018 1019 /* calculate period counters */ 1020 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1021 / (end_time - rs->time_last_bitmap_sync); 1022 1023 if (!page_count) { 1024 return; 1025 } 1026 1027 if (migrate_use_xbzrle()) { 1028 double encoded_size, unencoded_size; 1029 1030 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1031 rs->xbzrle_cache_miss_prev) / page_count; 1032 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1033 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1034 TARGET_PAGE_SIZE; 1035 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1036 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1037 xbzrle_counters.encoding_rate = 0; 1038 } else { 1039 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1040 } 1041 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1042 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1043 } 1044 1045 if (migrate_use_compression()) { 1046 compression_counters.busy_rate = (double)(compression_counters.busy - 1047 rs->compress_thread_busy_prev) / page_count; 1048 rs->compress_thread_busy_prev = compression_counters.busy; 1049 1050 compressed_size = compression_counters.compressed_size - 1051 rs->compressed_size_prev; 1052 if (compressed_size) { 1053 double uncompressed_size = (compression_counters.pages - 1054 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1055 1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1057 compression_counters.compression_rate = 1058 uncompressed_size / compressed_size; 1059 1060 rs->compress_pages_prev = compression_counters.pages; 1061 rs->compressed_size_prev = compression_counters.compressed_size; 1062 } 1063 } 1064 } 1065 1066 static void migration_trigger_throttle(RAMState *rs) 1067 { 1068 MigrationState *s = migrate_get_current(); 1069 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1070 1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1074 1075 /* During block migration the auto-converge logic incorrectly detects 1076 * that ram migration makes no progress. Avoid this by disabling the 1077 * throttling logic during the bulk phase of block migration. */ 1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1079 /* The following detection logic can be refined later. For now: 1080 Check to see if the ratio between dirtied bytes and the approx. 1081 amount of bytes that just got transferred since the last time 1082 we were in this routine reaches the threshold. If that happens 1083 twice, start or increase throttling. 
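 
       Worked example (hypothetical numbers, threshold at its default of 50):
       if roughly 1000 MB were transferred in the last period, then

           bytes_dirty_threshold = 1000 MB * 50 / 100 = 500 MB

       and only when the guest dirties more than 500 MB per period on two
       consecutive syncs does mig_throttle_guest_down() get called.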
*/ 1084 1085 if ((bytes_dirty_period > bytes_dirty_threshold) && 1086 (++rs->dirty_rate_high_cnt >= 2)) { 1087 trace_migration_throttle(); 1088 rs->dirty_rate_high_cnt = 0; 1089 mig_throttle_guest_down(bytes_dirty_period, 1090 bytes_dirty_threshold); 1091 } 1092 } 1093 } 1094 1095 static void migration_bitmap_sync(RAMState *rs) 1096 { 1097 RAMBlock *block; 1098 int64_t end_time; 1099 1100 ram_counters.dirty_sync_count++; 1101 1102 if (!rs->time_last_bitmap_sync) { 1103 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1104 } 1105 1106 trace_migration_bitmap_sync_start(); 1107 memory_global_dirty_log_sync(); 1108 1109 qemu_mutex_lock(&rs->bitmap_mutex); 1110 WITH_RCU_READ_LOCK_GUARD() { 1111 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1112 ramblock_sync_dirty_bitmap(rs, block); 1113 } 1114 ram_counters.remaining = ram_bytes_remaining(); 1115 } 1116 qemu_mutex_unlock(&rs->bitmap_mutex); 1117 1118 memory_global_after_dirty_log_sync(); 1119 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1120 1121 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1122 1123 /* more than 1 second = 1000 millisecons */ 1124 if (end_time > rs->time_last_bitmap_sync + 1000) { 1125 migration_trigger_throttle(rs); 1126 1127 migration_update_rates(rs, end_time); 1128 1129 rs->target_page_count_prev = rs->target_page_count; 1130 1131 /* reset period counters */ 1132 rs->time_last_bitmap_sync = end_time; 1133 rs->num_dirty_pages_period = 0; 1134 rs->bytes_xfer_prev = ram_counters.transferred; 1135 } 1136 if (migrate_use_events()) { 1137 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1138 } 1139 } 1140 1141 static void migration_bitmap_sync_precopy(RAMState *rs) 1142 { 1143 Error *local_err = NULL; 1144 1145 /* 1146 * The current notifier usage is just an optimization to migration, so we 1147 * don't stop the normal migration process in the error case. 1148 */ 1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1150 error_report_err(local_err); 1151 local_err = NULL; 1152 } 1153 1154 migration_bitmap_sync(rs); 1155 1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1157 error_report_err(local_err); 1158 } 1159 } 1160 1161 /** 1162 * save_zero_page_to_file: send the zero page to the file 1163 * 1164 * Returns the size of data written to the file, 0 means the page is not 1165 * a zero page 1166 * 1167 * @rs: current RAM state 1168 * @file: the file where the data is saved 1169 * @block: block that contains the page we want to send 1170 * @offset: offset inside the block for the page 1171 */ 1172 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1173 RAMBlock *block, ram_addr_t offset) 1174 { 1175 uint8_t *p = block->host + offset; 1176 int len = 0; 1177 1178 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1179 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1180 qemu_put_byte(file, 0); 1181 len += 1; 1182 } 1183 return len; 1184 } 1185 1186 /** 1187 * save_zero_page: send the zero page to the stream 1188 * 1189 * Returns the number of pages written. 
1190 * 1191 * @rs: current RAM state 1192 * @block: block that contains the page we want to send 1193 * @offset: offset inside the block for the page 1194 */ 1195 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1196 { 1197 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1198 1199 if (len) { 1200 ram_counters.duplicate++; 1201 ram_counters.transferred += len; 1202 return 1; 1203 } 1204 return -1; 1205 } 1206 1207 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1208 { 1209 if (!migrate_release_ram() || !migration_in_postcopy()) { 1210 return; 1211 } 1212 1213 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1214 } 1215 1216 /* 1217 * @pages: the number of pages written by the control path, 1218 * < 0 - error 1219 * > 0 - number of pages written 1220 * 1221 * Return true if the pages has been saved, otherwise false is returned. 1222 */ 1223 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1224 int *pages) 1225 { 1226 uint64_t bytes_xmit = 0; 1227 int ret; 1228 1229 *pages = -1; 1230 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1231 &bytes_xmit); 1232 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1233 return false; 1234 } 1235 1236 if (bytes_xmit) { 1237 ram_counters.transferred += bytes_xmit; 1238 *pages = 1; 1239 } 1240 1241 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1242 return true; 1243 } 1244 1245 if (bytes_xmit > 0) { 1246 ram_counters.normal++; 1247 } else if (bytes_xmit == 0) { 1248 ram_counters.duplicate++; 1249 } 1250 1251 return true; 1252 } 1253 1254 /* 1255 * directly send the page to the stream 1256 * 1257 * Returns the number of pages written. 1258 * 1259 * @rs: current RAM state 1260 * @block: block that contains the page we want to send 1261 * @offset: offset inside the block for the page 1262 * @buf: the page to be sent 1263 * @async: send to page asyncly 1264 */ 1265 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1266 uint8_t *buf, bool async) 1267 { 1268 ram_counters.transferred += save_page_header(rs, rs->f, block, 1269 offset | RAM_SAVE_FLAG_PAGE); 1270 if (async) { 1271 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1272 migrate_release_ram() & 1273 migration_in_postcopy()); 1274 } else { 1275 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1276 } 1277 ram_counters.transferred += TARGET_PAGE_SIZE; 1278 ram_counters.normal++; 1279 return 1; 1280 } 1281 1282 /** 1283 * ram_save_page: send the given page to the stream 1284 * 1285 * Returns the number of pages written. 1286 * < 0 - error 1287 * >=0 - Number of pages written - this might legally be 0 1288 * if xbzrle noticed the page was the same. 
1289 * 1290 * @rs: current RAM state 1291 * @block: block that contains the page we want to send 1292 * @offset: offset inside the block for the page 1293 * @last_stage: if we are at the completion stage 1294 */ 1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1296 { 1297 int pages = -1; 1298 uint8_t *p; 1299 bool send_async = true; 1300 RAMBlock *block = pss->block; 1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1302 ram_addr_t current_addr = block->offset + offset; 1303 1304 p = block->host + offset; 1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1306 1307 XBZRLE_cache_lock(); 1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1309 pages = save_xbzrle_page(rs, &p, current_addr, block, 1310 offset, last_stage); 1311 if (!last_stage) { 1312 /* Can't send this cached data async, since the cache page 1313 * might get updated before it gets to the wire 1314 */ 1315 send_async = false; 1316 } 1317 } 1318 1319 /* XBZRLE overflow or normal page */ 1320 if (pages == -1) { 1321 pages = save_normal_page(rs, block, offset, p, send_async); 1322 } 1323 1324 XBZRLE_cache_unlock(); 1325 1326 return pages; 1327 } 1328 1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1330 ram_addr_t offset) 1331 { 1332 if (multifd_queue_page(rs->f, block, offset) < 0) { 1333 return -1; 1334 } 1335 ram_counters.normal++; 1336 1337 return 1; 1338 } 1339 1340 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1341 ram_addr_t offset, uint8_t *source_buf) 1342 { 1343 RAMState *rs = ram_state; 1344 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1345 bool zero_page = false; 1346 int ret; 1347 1348 if (save_zero_page_to_file(rs, f, block, offset)) { 1349 zero_page = true; 1350 goto exit; 1351 } 1352 1353 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1354 1355 /* 1356 * copy it to a internal buffer to avoid it being modified by VM 1357 * so that we can catch up the error during compression and 1358 * decompression 1359 */ 1360 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1361 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1362 if (ret < 0) { 1363 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1364 error_report("compressed data failed!"); 1365 return false; 1366 } 1367 1368 exit: 1369 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1370 return zero_page; 1371 } 1372 1373 static void 1374 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1375 { 1376 ram_counters.transferred += bytes_xmit; 1377 1378 if (param->zero_page) { 1379 ram_counters.duplicate++; 1380 return; 1381 } 1382 1383 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. 
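 *
 * Worked example (hypothetical sizes): a page whose compressed stream is
 * flushed as 8 header bytes plus 1000 payload bytes bumps
 * ram_counters.transferred by 1008, while compressed_size only grows by
 * 1000; xbzrle_counters.bytes excludes the same 8-byte header (see
 * save_xbzrle_page()).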
*/ 1384 compression_counters.compressed_size += bytes_xmit - 8; 1385 compression_counters.pages++; 1386 } 1387 1388 static bool save_page_use_compression(RAMState *rs); 1389 1390 static void flush_compressed_data(RAMState *rs) 1391 { 1392 int idx, len, thread_count; 1393 1394 if (!save_page_use_compression(rs)) { 1395 return; 1396 } 1397 thread_count = migrate_compress_threads(); 1398 1399 qemu_mutex_lock(&comp_done_lock); 1400 for (idx = 0; idx < thread_count; idx++) { 1401 while (!comp_param[idx].done) { 1402 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1403 } 1404 } 1405 qemu_mutex_unlock(&comp_done_lock); 1406 1407 for (idx = 0; idx < thread_count; idx++) { 1408 qemu_mutex_lock(&comp_param[idx].mutex); 1409 if (!comp_param[idx].quit) { 1410 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1411 /* 1412 * it's safe to fetch zero_page without holding comp_done_lock 1413 * as there is no further request submitted to the thread, 1414 * i.e, the thread should be waiting for a request at this point. 1415 */ 1416 update_compress_thread_counts(&comp_param[idx], len); 1417 } 1418 qemu_mutex_unlock(&comp_param[idx].mutex); 1419 } 1420 } 1421 1422 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1423 ram_addr_t offset) 1424 { 1425 param->block = block; 1426 param->offset = offset; 1427 } 1428 1429 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1430 ram_addr_t offset) 1431 { 1432 int idx, thread_count, bytes_xmit = -1, pages = -1; 1433 bool wait = migrate_compress_wait_thread(); 1434 1435 thread_count = migrate_compress_threads(); 1436 qemu_mutex_lock(&comp_done_lock); 1437 retry: 1438 for (idx = 0; idx < thread_count; idx++) { 1439 if (comp_param[idx].done) { 1440 comp_param[idx].done = false; 1441 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1442 qemu_mutex_lock(&comp_param[idx].mutex); 1443 set_compress_params(&comp_param[idx], block, offset); 1444 qemu_cond_signal(&comp_param[idx].cond); 1445 qemu_mutex_unlock(&comp_param[idx].mutex); 1446 pages = 1; 1447 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1448 break; 1449 } 1450 } 1451 1452 /* 1453 * wait for the free thread if the user specifies 'compress-wait-thread', 1454 * otherwise we will post the page out in the main thread as normal page. 1455 */ 1456 if (pages < 0 && wait) { 1457 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1458 goto retry; 1459 } 1460 qemu_mutex_unlock(&comp_done_lock); 1461 1462 return pages; 1463 } 1464 1465 /** 1466 * find_dirty_block: find the next dirty page and update any state 1467 * associated with the search process. 1468 * 1469 * Returns true if a page is found 1470 * 1471 * @rs: current RAM state 1472 * @pss: data about the state of the current dirty page scan 1473 * @again: set to false if the search has scanned the whole of RAM 1474 */ 1475 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1476 { 1477 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1478 if (pss->complete_round && pss->block == rs->last_seen_block && 1479 pss->page >= rs->last_page) { 1480 /* 1481 * We've been once around the RAM and haven't found anything. 1482 * Give up. 
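 *
 * For reference, the caller drives this in a loop shaped roughly like the
 * sketch below (illustrative only; the real loop, ram_find_and_save_block()
 * further down in this file, also interleaves postcopy queue handling):
 *
 *   do {
 *       again = true;
 *       found = find_dirty_block(rs, &pss, &again);
 *       if (found) {
 *           pages = ram_save_host_page(rs, &pss, last_stage);
 *       }
 *   } while (!pages && again);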
1483 */ 1484 *again = false; 1485 return false; 1486 } 1487 if (!offset_in_ramblock(pss->block, 1488 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1489 /* Didn't find anything in this RAM Block */ 1490 pss->page = 0; 1491 pss->block = QLIST_NEXT_RCU(pss->block, next); 1492 if (!pss->block) { 1493 /* 1494 * If memory migration starts over, we will meet a dirtied page 1495 * which may still exists in compression threads's ring, so we 1496 * should flush the compressed data to make sure the new page 1497 * is not overwritten by the old one in the destination. 1498 * 1499 * Also If xbzrle is on, stop using the data compression at this 1500 * point. In theory, xbzrle can do better than compression. 1501 */ 1502 flush_compressed_data(rs); 1503 1504 /* Hit the end of the list */ 1505 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1506 /* Flag that we've looped */ 1507 pss->complete_round = true; 1508 /* After the first round, enable XBZRLE. */ 1509 if (migrate_use_xbzrle()) { 1510 rs->xbzrle_enabled = true; 1511 } 1512 } 1513 /* Didn't find anything this time, but try again on the new block */ 1514 *again = true; 1515 return false; 1516 } else { 1517 /* Can go around again, but... */ 1518 *again = true; 1519 /* We've found something so probably don't need to */ 1520 return true; 1521 } 1522 } 1523 1524 /** 1525 * unqueue_page: gets a page of the queue 1526 * 1527 * Helper for 'get_queued_page' - gets a page off the queue 1528 * 1529 * Returns the block of the page (or NULL if none available) 1530 * 1531 * @rs: current RAM state 1532 * @offset: used to return the offset within the RAMBlock 1533 */ 1534 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1535 { 1536 RAMBlock *block = NULL; 1537 1538 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1539 return NULL; 1540 } 1541 1542 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1543 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1544 struct RAMSrcPageRequest *entry = 1545 QSIMPLEQ_FIRST(&rs->src_page_requests); 1546 block = entry->rb; 1547 *offset = entry->offset; 1548 1549 if (entry->len > TARGET_PAGE_SIZE) { 1550 entry->len -= TARGET_PAGE_SIZE; 1551 entry->offset += TARGET_PAGE_SIZE; 1552 } else { 1553 memory_region_unref(block->mr); 1554 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1555 g_free(entry); 1556 migration_consume_urgent_request(); 1557 } 1558 } 1559 1560 return block; 1561 } 1562 1563 #if defined(__linux__) 1564 /** 1565 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1566 * is found, return RAM block pointer and page offset 1567 * 1568 * Returns pointer to the RAMBlock containing faulting page, 1569 * NULL if no write faults are pending 1570 * 1571 * @rs: current RAM state 1572 * @offset: page offset from the beginning of the block 1573 */ 1574 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1575 { 1576 struct uffd_msg uffd_msg; 1577 void *page_address; 1578 RAMBlock *block; 1579 int res; 1580 1581 if (!migrate_background_snapshot()) { 1582 return NULL; 1583 } 1584 1585 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1586 if (res <= 0) { 1587 return NULL; 1588 } 1589 1590 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1591 block = qemu_ram_block_from_host(page_address, false, offset); 1592 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1593 return block; 1594 } 1595 1596 /** 1597 * ram_save_release_protection: release UFFD write protection after 1598 * a range of pages has been saved 1599 * 1600 * @rs: current RAM state 
1601 * @pss: page-search-status structure 1602 * @start_page: index of the first page in the range relative to pss->block 1603 * 1604 * Returns 0 on success, negative value in case of an error 1605 */ 1606 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1607 unsigned long start_page) 1608 { 1609 int res = 0; 1610 1611 /* Check if page is from UFFD-managed region. */ 1612 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1613 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1614 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; 1615 1616 /* Flush async buffers before un-protect. */ 1617 qemu_fflush(rs->f); 1618 /* Un-protect memory range. */ 1619 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1620 false, false); 1621 } 1622 1623 return res; 1624 } 1625 1626 /* ram_write_tracking_available: check if kernel supports required UFFD features 1627 * 1628 * Returns true if supports, false otherwise 1629 */ 1630 bool ram_write_tracking_available(void) 1631 { 1632 uint64_t uffd_features; 1633 int res; 1634 1635 res = uffd_query_features(&uffd_features); 1636 return (res == 0 && 1637 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1638 } 1639 1640 /* ram_write_tracking_compatible: check if guest configuration is 1641 * compatible with 'write-tracking' 1642 * 1643 * Returns true if compatible, false otherwise 1644 */ 1645 bool ram_write_tracking_compatible(void) 1646 { 1647 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1648 int uffd_fd; 1649 RAMBlock *block; 1650 bool ret = false; 1651 1652 /* Open UFFD file descriptor */ 1653 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1654 if (uffd_fd < 0) { 1655 return false; 1656 } 1657 1658 RCU_READ_LOCK_GUARD(); 1659 1660 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1661 uint64_t uffd_ioctls; 1662 1663 /* Nothing to do with read-only and MMIO-writable regions */ 1664 if (block->mr->readonly || block->mr->rom_device) { 1665 continue; 1666 } 1667 /* Try to register block memory via UFFD-IO to track writes */ 1668 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1669 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1670 goto out; 1671 } 1672 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1673 goto out; 1674 } 1675 } 1676 ret = true; 1677 1678 out: 1679 uffd_close_fd(uffd_fd); 1680 return ret; 1681 } 1682 1683 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1684 ram_addr_t size) 1685 { 1686 /* 1687 * We read one byte of each page; this will preallocate page tables if 1688 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1689 * where no page was populated yet. This might require adaption when 1690 * supporting other mappings, like shmem. 1691 */ 1692 for (; offset < size; offset += block->page_size) { 1693 char tmp = *((char *)block->host + offset); 1694 1695 /* Don't optimize the read out */ 1696 asm volatile("" : "+r" (tmp)); 1697 } 1698 } 1699 1700 static inline int populate_read_section(MemoryRegionSection *section, 1701 void *opaque) 1702 { 1703 const hwaddr size = int128_get64(section->size); 1704 hwaddr offset = section->offset_within_region; 1705 RAMBlock *block = section->mr->ram_block; 1706 1707 populate_read_range(block, offset, size); 1708 return 0; 1709 } 1710 1711 /* 1712 * ram_block_populate_read: preallocate page tables and populate pages in the 1713 * RAM block by reading a byte of each page. 
1714 * 1715 * Since it's solely used for userfault_fd WP feature, here we just 1716 * hardcode page size to qemu_real_host_page_size. 1717 * 1718 * @block: RAM block to populate 1719 */ 1720 static void ram_block_populate_read(RAMBlock *rb) 1721 { 1722 /* 1723 * Skip populating all pages that fall into a discarded range as managed by 1724 * a RamDiscardManager responsible for the mapped memory region of the 1725 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1726 * must not get populated automatically. We don't have to track 1727 * modifications via userfaultfd WP reliably, because these pages will 1728 * not be part of the migration stream either way -- see 1729 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1730 * 1731 * Note: The result is only stable while migrating (precopy/postcopy). 1732 */ 1733 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1734 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1735 MemoryRegionSection section = { 1736 .mr = rb->mr, 1737 .offset_within_region = 0, 1738 .size = rb->mr->size, 1739 }; 1740 1741 ram_discard_manager_replay_populated(rdm, §ion, 1742 populate_read_section, NULL); 1743 } else { 1744 populate_read_range(rb, 0, rb->used_length); 1745 } 1746 } 1747 1748 /* 1749 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1750 */ 1751 void ram_write_tracking_prepare(void) 1752 { 1753 RAMBlock *block; 1754 1755 RCU_READ_LOCK_GUARD(); 1756 1757 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1758 /* Nothing to do with read-only and MMIO-writable regions */ 1759 if (block->mr->readonly || block->mr->rom_device) { 1760 continue; 1761 } 1762 1763 /* 1764 * Populate pages of the RAM block before enabling userfault_fd 1765 * write protection. 1766 * 1767 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1768 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1769 * pages with pte_none() entries in page table. 
1770 */ 1771 ram_block_populate_read(block); 1772 } 1773 } 1774 1775 /* 1776 * ram_write_tracking_start: start UFFD-WP memory tracking 1777 * 1778 * Returns 0 for success or negative value in case of error 1779 */ 1780 int ram_write_tracking_start(void) 1781 { 1782 int uffd_fd; 1783 RAMState *rs = ram_state; 1784 RAMBlock *block; 1785 1786 /* Open UFFD file descriptor */ 1787 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1788 if (uffd_fd < 0) { 1789 return uffd_fd; 1790 } 1791 rs->uffdio_fd = uffd_fd; 1792 1793 RCU_READ_LOCK_GUARD(); 1794 1795 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1796 /* Nothing to do with read-only and MMIO-writable regions */ 1797 if (block->mr->readonly || block->mr->rom_device) { 1798 continue; 1799 } 1800 1801 /* Register block memory with UFFD to track writes */ 1802 if (uffd_register_memory(rs->uffdio_fd, block->host, 1803 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1804 goto fail; 1805 } 1806 /* Apply UFFD write protection to the block memory range */ 1807 if (uffd_change_protection(rs->uffdio_fd, block->host, 1808 block->max_length, true, false)) { 1809 goto fail; 1810 } 1811 block->flags |= RAM_UF_WRITEPROTECT; 1812 memory_region_ref(block->mr); 1813 1814 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1815 block->host, block->max_length); 1816 } 1817 1818 return 0; 1819 1820 fail: 1821 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1822 1823 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1824 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1825 continue; 1826 } 1827 /* 1828 * In case some memory block failed to be write-protected 1829 * remove protection and unregister all succeeded RAM blocks 1830 */ 1831 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1832 false, false); 1833 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1834 /* Cleanup flags and remove reference */ 1835 block->flags &= ~RAM_UF_WRITEPROTECT; 1836 memory_region_unref(block->mr); 1837 } 1838 1839 uffd_close_fd(uffd_fd); 1840 rs->uffdio_fd = -1; 1841 return -1; 1842 } 1843 1844 /** 1845 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1846 */ 1847 void ram_write_tracking_stop(void) 1848 { 1849 RAMState *rs = ram_state; 1850 RAMBlock *block; 1851 1852 RCU_READ_LOCK_GUARD(); 1853 1854 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1855 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1856 continue; 1857 } 1858 /* Remove protection and unregister all affected RAM blocks */ 1859 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1860 false, false); 1861 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1862 1863 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1864 block->host, block->max_length); 1865 1866 /* Cleanup flags and remove reference */ 1867 block->flags &= ~RAM_UF_WRITEPROTECT; 1868 memory_region_unref(block->mr); 1869 } 1870 1871 /* Finally close UFFD file descriptor */ 1872 uffd_close_fd(rs->uffdio_fd); 1873 rs->uffdio_fd = -1; 1874 } 1875 1876 #else 1877 /* No target OS support, stubs just fail or ignore */ 1878 1879 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1880 { 1881 (void) rs; 1882 (void) offset; 1883 1884 return NULL; 1885 } 1886 1887 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1888 unsigned long start_page) 1889 { 1890 (void) rs; 1891 (void) pss; 1892 (void) start_page; 1893 1894 return 0; 1895 } 1896 1897 bool 
ram_write_tracking_available(void) 1898 { 1899 return false; 1900 } 1901 1902 bool ram_write_tracking_compatible(void) 1903 { 1904 assert(0); 1905 return false; 1906 } 1907 1908 int ram_write_tracking_start(void) 1909 { 1910 assert(0); 1911 return -1; 1912 } 1913 1914 void ram_write_tracking_stop(void) 1915 { 1916 assert(0); 1917 } 1918 #endif /* defined(__linux__) */ 1919 1920 /** 1921 * get_queued_page: unqueue a page from the postcopy requests 1922 * 1923 * Skips pages that are already sent (!dirty) 1924 * 1925 * Returns true if a queued page is found 1926 * 1927 * @rs: current RAM state 1928 * @pss: data about the state of the current dirty page scan 1929 */ 1930 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1931 { 1932 RAMBlock *block; 1933 ram_addr_t offset; 1934 bool dirty; 1935 1936 do { 1937 block = unqueue_page(rs, &offset); 1938 /* 1939 * We're sending this page, and since it's postcopy nothing else 1940 * will dirty it, and we must make sure it doesn't get sent again 1941 * even if this queue request was received after the background 1942 * search already sent it. 1943 */ 1944 if (block) { 1945 unsigned long page; 1946 1947 page = offset >> TARGET_PAGE_BITS; 1948 dirty = test_bit(page, block->bmap); 1949 if (!dirty) { 1950 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1951 page); 1952 } else { 1953 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1954 } 1955 } 1956 1957 } while (block && !dirty); 1958 1959 if (!block) { 1960 /* 1961 * Poll write faults too if background snapshot is enabled; that's 1962 * when vcpus may have got blocked by the write-protected pages. 1963 */ 1964 block = poll_fault_page(rs, &offset); 1965 } 1966 1967 if (block) { 1968 /* 1969 * We want the background search to continue from the queued page 1970 * since the guest is likely to want other pages near to the page 1971 * it just requested. 1972 */ 1973 pss->block = block; 1974 pss->page = offset >> TARGET_PAGE_BITS; 1975 1976 /* 1977 * This unqueued page would break the "one round" check, even if 1978 * it's really rare. 1979 */ 1980 pss->complete_round = false; 1981 } 1982 1983 return !!block; 1984 } 1985 1986 /** 1987 * migration_page_queue_free: drop any remaining pages in the ram 1988 * request queue 1989 * 1990 * It should be empty at the end anyway, but in error cases there may 1991 * be some left. If any pages are left, we drop them. 1992 * 1993 */ 1994 static void migration_page_queue_free(RAMState *rs) 1995 { 1996 struct RAMSrcPageRequest *mspr, *next_mspr; 1997 /* This queue generally should be empty - but in the case of a failed 1998 * migration it might have some droppings in it. 1999 */ 2000 RCU_READ_LOCK_GUARD(); 2001 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2002 memory_region_unref(mspr->rb->mr); 2003 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2004 g_free(mspr); 2005 } 2006 } 2007 2008 /** 2009 * ram_save_queue_pages: queue the page for transmission 2010 * 2011 * A request from the postcopy destination, for example. 2012 * 2013 * Returns zero on success or negative on error 2014 * 2015 * @rbname: Name of the RAMBlock of the request. NULL means the 2016 * same as the last one.
2017 * @start: starting address from the start of the RAMBlock 2018 * @len: length (in bytes) to send 2019 */ 2020 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2021 { 2022 RAMBlock *ramblock; 2023 RAMState *rs = ram_state; 2024 2025 ram_counters.postcopy_requests++; 2026 RCU_READ_LOCK_GUARD(); 2027 2028 if (!rbname) { 2029 /* Reuse last RAMBlock */ 2030 ramblock = rs->last_req_rb; 2031 2032 if (!ramblock) { 2033 /* 2034 * Shouldn't happen: we can't reuse the last RAMBlock if 2035 * this is the 1st request. 2036 */ 2037 error_report("ram_save_queue_pages no previous block"); 2038 return -1; 2039 } 2040 } else { 2041 ramblock = qemu_ram_block_by_name(rbname); 2042 2043 if (!ramblock) { 2044 /* We shouldn't be asked for a non-existent RAMBlock */ 2045 error_report("ram_save_queue_pages no block '%s'", rbname); 2046 return -1; 2047 } 2048 rs->last_req_rb = ramblock; 2049 } 2050 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2051 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2052 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2053 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2054 __func__, start, len, ramblock->used_length); 2055 return -1; 2056 } 2057 2058 struct RAMSrcPageRequest *new_entry = 2059 g_malloc0(sizeof(struct RAMSrcPageRequest)); 2060 new_entry->rb = ramblock; 2061 new_entry->offset = start; 2062 new_entry->len = len; 2063 2064 memory_region_ref(ramblock->mr); 2065 qemu_mutex_lock(&rs->src_page_req_mutex); 2066 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2067 migration_make_urgent_request(); 2068 qemu_mutex_unlock(&rs->src_page_req_mutex); 2069 2070 return 0; 2071 } 2072 2073 static bool save_page_use_compression(RAMState *rs) 2074 { 2075 if (!migrate_use_compression()) { 2076 return false; 2077 } 2078 2079 /* 2080 * If xbzrle is enabled (e.g., after the first round of migration), stop 2081 * using data compression. In theory, xbzrle can do better than 2082 * compression. 2083 */ 2084 if (rs->xbzrle_enabled) { 2085 return false; 2086 } 2087 2088 return true; 2089 } 2090 2091 /* 2092 * Try to compress the page before posting it out; return true if the page 2093 * has been properly handled by compression, otherwise it needs other 2094 * paths to handle it. 2095 */ 2096 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2097 { 2098 if (!save_page_use_compression(rs)) { 2099 return false; 2100 } 2101 2102 /* 2103 * When starting the process of a new block, the first page of 2104 * the block should be sent out before other pages in the same 2105 * block, and all the pages in the last block should have been sent 2106 * out; keeping this order is important, because the 'cont' flag 2107 * is used to avoid resending the block name. 2108 * 2109 * We post the first page as a normal page as compression will take 2110 * much CPU resource.
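 *
 * As an illustrative sketch of that framing (mirrored on the load side by
 * ram_block_from_stream() further below), only the first page sent for a
 * block carries the block idstr; later pages of the same block set
 * RAM_SAVE_FLAG_CONTINUE so the name is not resent:
 *
 *   qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);        // new block: name follows
 *   qemu_put_byte(f, strlen(block->idstr));
 *   qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
 *   ...
 *   qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);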
2111 */ 2112 if (block != rs->last_sent_block) { 2113 flush_compressed_data(rs); 2114 return false; 2115 } 2116 2117 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2118 return true; 2119 } 2120 2121 compression_counters.busy++; 2122 return false; 2123 } 2124 2125 /** 2126 * ram_save_target_page: save one target page 2127 * 2128 * Returns the number of pages written 2129 * 2130 * @rs: current RAM state 2131 * @pss: data about the page we want to send 2132 * @last_stage: if we are at the completion stage 2133 */ 2134 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 2135 bool last_stage) 2136 { 2137 RAMBlock *block = pss->block; 2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2139 int res; 2140 2141 if (control_save_page(rs, block, offset, &res)) { 2142 return res; 2143 } 2144 2145 if (save_compress_page(rs, block, offset)) { 2146 return 1; 2147 } 2148 2149 res = save_zero_page(rs, block, offset); 2150 if (res > 0) { 2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2152 * page would be stale 2153 */ 2154 if (!save_page_use_compression(rs)) { 2155 XBZRLE_cache_lock(); 2156 xbzrle_cache_zero_page(rs, block->offset + offset); 2157 XBZRLE_cache_unlock(); 2158 } 2159 ram_release_pages(block->idstr, offset, res); 2160 return res; 2161 } 2162 2163 /* 2164 * Do not use multifd for: 2165 * 1. Compression, as the first page in the new block should be posted out 2166 * before sending the compressed page 2167 * 2. In postcopy, as one whole host page should be placed 2168 */ 2169 if (!save_page_use_compression(rs) && migrate_use_multifd() 2170 && !migration_in_postcopy()) { 2171 return ram_save_multifd_page(rs, block, offset); 2172 } 2173 2174 return ram_save_page(rs, pss, last_stage); 2175 } 2176 2177 /** 2178 * ram_save_host_page: save a whole host page 2179 * 2180 * Starting at *offset, send pages up to the end of the current host 2181 * page. It's valid for the initial offset to point into the middle of 2182 * a host page in which case the remainder of the hostpage is sent. 2183 * Only dirty target pages are sent. Note that the host page size may 2184 * be a huge page for this block. 2185 * The saving stops at the boundary of the used_length of the block 2186 * if the RAMBlock's used_length isn't a multiple of the host page size. 2187 * 2188 * Returns the number of pages written or negative on error 2189 * 2190 * @rs: current RAM state 2192 * @pss: data about the page we want to send 2193 * @last_stage: if we are at the completion stage 2194 */ 2195 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 2196 bool last_stage) 2197 { 2198 int tmppages, pages = 0; 2199 size_t pagesize_bits = 2200 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2201 unsigned long hostpage_boundary = 2202 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2203 unsigned long start_page = pss->page; 2204 int res; 2205 2206 if (ramblock_is_ignored(pss->block)) { 2207 error_report("block %s should not be migrated !", pss->block->idstr); 2208 return 0; 2209 } 2210 2211 do { 2212 /* Check if the page is dirty and, if so, send it */ 2213 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2214 tmppages = ram_save_target_page(rs, pss, last_stage); 2215 if (tmppages < 0) { 2216 return tmppages; 2217 } 2218 2219 pages += tmppages; 2220 /* 2221 * Allow rate limiting to happen in the middle of huge pages if 2222 * something is sent in the current iteration.
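 *
 * For example (sizes assumed for illustration): with a 2 MiB hugetlbfs
 * backed RAMBlock and 4 KiB target pages, pagesize_bits is
 * 2 MiB / 4 KiB = 512, so up to 512 target pages may be sent per host
 * page and migration_rate_limit() can run between them.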
2223 */ 2224 if (pagesize_bits > 1 && tmppages > 0) { 2225 migration_rate_limit(); 2226 } 2227 } 2228 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2229 } while ((pss->page < hostpage_boundary) && 2230 offset_in_ramblock(pss->block, 2231 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2232 /* The offset we leave with is the min boundary of host page and block */ 2233 pss->page = MIN(pss->page, hostpage_boundary) - 1; 2234 2235 res = ram_save_release_protection(rs, pss, start_page); 2236 return (res < 0 ? res : pages); 2237 } 2238 2239 /** 2240 * ram_find_and_save_block: finds a dirty page and sends it to f 2241 * 2242 * Called within an RCU critical section. 2243 * 2244 * Returns the number of pages written where zero means no dirty pages, 2245 * or negative on error 2246 * 2247 * @rs: current RAM state 2248 * @last_stage: if we are at the completion stage 2249 * 2250 * On systems where host-page-size > target-page-size it will send all the 2251 * pages in a host page that are dirty. 2252 */ 2253 2254 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2255 { 2256 PageSearchStatus pss; 2257 int pages = 0; 2258 bool again, found; 2259 2260 /* No dirty page as there is zero RAM */ 2261 if (!ram_bytes_total()) { 2262 return pages; 2263 } 2264 2265 pss.block = rs->last_seen_block; 2266 pss.page = rs->last_page; 2267 pss.complete_round = false; 2268 2269 if (!pss.block) { 2270 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2271 } 2272 2273 do { 2274 again = true; 2275 found = get_queued_page(rs, &pss); 2276 2277 if (!found) { 2278 /* priority queue empty, so just search for something dirty */ 2279 found = find_dirty_block(rs, &pss, &again); 2280 } 2281 2282 if (found) { 2283 pages = ram_save_host_page(rs, &pss, last_stage); 2284 } 2285 } while (!pages && again); 2286 2287 rs->last_seen_block = pss.block; 2288 rs->last_page = pss.page; 2289 2290 return pages; 2291 } 2292 2293 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2294 { 2295 uint64_t pages = size / TARGET_PAGE_SIZE; 2296 2297 if (zero) { 2298 ram_counters.duplicate += pages; 2299 } else { 2300 ram_counters.normal += pages; 2301 ram_counters.transferred += size; 2302 qemu_update_position(f, size); 2303 } 2304 } 2305 2306 static uint64_t ram_bytes_total_common(bool count_ignored) 2307 { 2308 RAMBlock *block; 2309 uint64_t total = 0; 2310 2311 RCU_READ_LOCK_GUARD(); 2312 2313 if (count_ignored) { 2314 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2315 total += block->used_length; 2316 } 2317 } else { 2318 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2319 total += block->used_length; 2320 } 2321 } 2322 return total; 2323 } 2324 2325 uint64_t ram_bytes_total(void) 2326 { 2327 return ram_bytes_total_common(false); 2328 } 2329 2330 static void xbzrle_load_setup(void) 2331 { 2332 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2333 } 2334 2335 static void xbzrle_load_cleanup(void) 2336 { 2337 g_free(XBZRLE.decoded_buf); 2338 XBZRLE.decoded_buf = NULL; 2339 } 2340 2341 static void ram_state_cleanup(RAMState **rsp) 2342 { 2343 if (*rsp) { 2344 migration_page_queue_free(*rsp); 2345 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2346 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2347 g_free(*rsp); 2348 *rsp = NULL; 2349 } 2350 } 2351 2352 static void xbzrle_cleanup(void) 2353 { 2354 XBZRLE_cache_lock(); 2355 if (XBZRLE.cache) { 2356 cache_fini(XBZRLE.cache); 2357 g_free(XBZRLE.encoded_buf); 2358 g_free(XBZRLE.current_buf); 2359 g_free(XBZRLE.zero_target_page); 2360 XBZRLE.cache = NULL; 2361 XBZRLE.encoded_buf = 
NULL; 2362 XBZRLE.current_buf = NULL; 2363 XBZRLE.zero_target_page = NULL; 2364 } 2365 XBZRLE_cache_unlock(); 2366 } 2367 2368 static void ram_save_cleanup(void *opaque) 2369 { 2370 RAMState **rsp = opaque; 2371 RAMBlock *block; 2372 2373 /* We don't use dirty log with background snapshots */ 2374 if (!migrate_background_snapshot()) { 2375 /* caller have hold iothread lock or is in a bh, so there is 2376 * no writing race against the migration bitmap 2377 */ 2378 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2379 /* 2380 * do not stop dirty log without starting it, since 2381 * memory_global_dirty_log_stop will assert that 2382 * memory_global_dirty_log_start/stop used in pairs 2383 */ 2384 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2385 } 2386 } 2387 2388 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2389 g_free(block->clear_bmap); 2390 block->clear_bmap = NULL; 2391 g_free(block->bmap); 2392 block->bmap = NULL; 2393 } 2394 2395 xbzrle_cleanup(); 2396 compress_threads_save_cleanup(); 2397 ram_state_cleanup(rsp); 2398 } 2399 2400 static void ram_state_reset(RAMState *rs) 2401 { 2402 rs->last_seen_block = NULL; 2403 rs->last_sent_block = NULL; 2404 rs->last_page = 0; 2405 rs->last_version = ram_list.version; 2406 rs->xbzrle_enabled = false; 2407 } 2408 2409 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2410 2411 /* 2412 * 'expected' is the value you expect the bitmap mostly to be full 2413 * of; it won't bother printing lines that are all this value. 2414 * If 'todump' is null the migration bitmap is dumped. 2415 */ 2416 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2417 unsigned long pages) 2418 { 2419 int64_t cur; 2420 int64_t linelen = 128; 2421 char linebuf[129]; 2422 2423 for (cur = 0; cur < pages; cur += linelen) { 2424 int64_t curb; 2425 bool found = false; 2426 /* 2427 * Last line; catch the case where the line length 2428 * is longer than remaining ram 2429 */ 2430 if (cur + linelen > pages) { 2431 linelen = pages - cur; 2432 } 2433 for (curb = 0; curb < linelen; curb++) { 2434 bool thisbit = test_bit(cur + curb, todump); 2435 linebuf[curb] = thisbit ? 
'1' : '.'; 2436 found = found || (thisbit != expected); 2437 } 2438 if (found) { 2439 linebuf[curb] = '\0'; 2440 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2441 } 2442 } 2443 } 2444 2445 /* **** functions for postcopy ***** */ 2446 2447 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2448 { 2449 struct RAMBlock *block; 2450 2451 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2452 unsigned long *bitmap = block->bmap; 2453 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2454 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2455 2456 while (run_start < range) { 2457 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2458 ram_discard_range(block->idstr, 2459 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2460 ((ram_addr_t)(run_end - run_start)) 2461 << TARGET_PAGE_BITS); 2462 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2463 } 2464 } 2465 } 2466 2467 /** 2468 * postcopy_send_discard_bm_ram: discard a RAMBlock 2469 * 2470 * Returns zero on success 2471 * 2472 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2473 * 2474 * @ms: current migration state 2475 * @block: RAMBlock to discard 2476 */ 2477 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2478 { 2479 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2480 unsigned long current; 2481 unsigned long *bitmap = block->bmap; 2482 2483 for (current = 0; current < end; ) { 2484 unsigned long one = find_next_bit(bitmap, end, current); 2485 unsigned long zero, discard_length; 2486 2487 if (one >= end) { 2488 break; 2489 } 2490 2491 zero = find_next_zero_bit(bitmap, end, one + 1); 2492 2493 if (zero >= end) { 2494 discard_length = end - one; 2495 } else { 2496 discard_length = zero - one; 2497 } 2498 postcopy_discard_send_range(ms, one, discard_length); 2499 current = one + discard_length; 2500 } 2501 2502 return 0; 2503 } 2504 2505 /** 2506 * postcopy_each_ram_send_discard: discard all RAMBlocks 2507 * 2508 * Returns 0 for success or negative for error 2509 * 2510 * Utility for the outgoing postcopy code. 2511 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2512 * passing it bitmap indexes and name. 2513 * (qemu_ram_foreach_block ends up passing unscaled lengths 2514 * which would mean postcopy code would have to deal with target page) 2515 * 2516 * @ms: current migration state 2517 */ 2518 static int postcopy_each_ram_send_discard(MigrationState *ms) 2519 { 2520 struct RAMBlock *block; 2521 int ret; 2522 2523 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2524 postcopy_discard_send_init(ms, block->idstr); 2525 2526 /* 2527 * Postcopy sends chunks of bitmap over the wire, but it 2528 * just needs indexes at this point, avoids it having 2529 * target page specific code. 2530 */ 2531 ret = postcopy_send_discard_bm_ram(ms, block); 2532 postcopy_discard_send_finish(ms); 2533 if (ret) { 2534 return ret; 2535 } 2536 } 2537 2538 return 0; 2539 } 2540 2541 /** 2542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2543 * 2544 * Helper for postcopy_chunk_hostpages; it's called twice to 2545 * canonicalize the two bitmaps, that are similar, but one is 2546 * inverted. 2547 * 2548 * Postcopy requires that all target pages in a hostpage are dirty or 2549 * clean, not a mix. This function canonicalizes the bitmaps. 
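 *
 * Worked example (sizes assumed for illustration): with a 2 MiB host page
 * and 4 KiB target pages, host_ratio is 512. If any of the 512 bits
 * covering one host page is dirty, the pass below re-marks all 512 bits
 * dirty via test_and_set_bit(), so afterwards every 512-bit group is
 * either all zeros or all ones.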
2550 * 2551 * @ms: current migration state 2552 * @block: block that contains the page we want to canonicalize 2553 */ 2554 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2555 { 2556 RAMState *rs = ram_state; 2557 unsigned long *bitmap = block->bmap; 2558 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2559 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2560 unsigned long run_start; 2561 2562 if (block->page_size == TARGET_PAGE_SIZE) { 2563 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2564 return; 2565 } 2566 2567 /* Find a dirty page */ 2568 run_start = find_next_bit(bitmap, pages, 0); 2569 2570 while (run_start < pages) { 2571 2572 /* 2573 * If the start of this run of pages is in the middle of a host 2574 * page, then we need to fixup this host page. 2575 */ 2576 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2577 /* Find the end of this run */ 2578 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2579 /* 2580 * If the end isn't at the start of a host page, then the 2581 * run doesn't finish at the end of a host page 2582 * and we need to discard. 2583 */ 2584 } 2585 2586 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2587 unsigned long page; 2588 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2589 host_ratio); 2590 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2591 2592 /* Clean up the bitmap */ 2593 for (page = fixup_start_addr; 2594 page < fixup_start_addr + host_ratio; page++) { 2595 /* 2596 * Remark them as dirty, updating the count for any pages 2597 * that weren't previously dirty. 2598 */ 2599 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2600 } 2601 } 2602 2603 /* Find the next dirty page for the next iteration */ 2604 run_start = find_next_bit(bitmap, pages, run_start); 2605 } 2606 } 2607 2608 /** 2609 * postcopy_chunk_hostpages: discard any partially sent host page 2610 * 2611 * Utility for the outgoing postcopy code. 2612 * 2613 * Discard any partially sent host-page size chunks, mark any partially 2614 * dirty host-page size chunks as all dirty. In this case the host-page 2615 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 2616 * 2617 * Returns zero on success 2618 * 2619 * @ms: current migration state 2620 * @block: block we want to work with 2621 */ 2622 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2623 { 2624 postcopy_discard_send_init(ms, block->idstr); 2625 2626 /* 2627 * Ensure that all partially dirty host pages are made fully dirty. 
2628 */ 2629 postcopy_chunk_hostpages_pass(ms, block); 2630 2631 postcopy_discard_send_finish(ms); 2632 return 0; 2633 } 2634 2635 /** 2636 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2637 * 2638 * Returns zero on success 2639 * 2640 * Transmit the set of pages to be discarded after precopy to the target 2641 * these are pages that: 2642 * a) Have been previously transmitted but are now dirty again 2643 * b) Pages that have never been transmitted, this ensures that 2644 * any pages on the destination that have been mapped by background 2645 * tasks get discarded (transparent huge pages is the specific concern) 2646 * Hopefully this is pretty sparse 2647 * 2648 * @ms: current migration state 2649 */ 2650 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2651 { 2652 RAMState *rs = ram_state; 2653 RAMBlock *block; 2654 int ret; 2655 2656 RCU_READ_LOCK_GUARD(); 2657 2658 /* This should be our last sync, the src is now paused */ 2659 migration_bitmap_sync(rs); 2660 2661 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2662 rs->last_seen_block = NULL; 2663 rs->last_sent_block = NULL; 2664 rs->last_page = 0; 2665 2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2667 /* Deal with TPS != HPS and huge pages */ 2668 ret = postcopy_chunk_hostpages(ms, block); 2669 if (ret) { 2670 return ret; 2671 } 2672 2673 #ifdef DEBUG_POSTCOPY 2674 ram_debug_dump_bitmap(block->bmap, true, 2675 block->used_length >> TARGET_PAGE_BITS); 2676 #endif 2677 } 2678 trace_ram_postcopy_send_discard_bitmap(); 2679 2680 return postcopy_each_ram_send_discard(ms); 2681 } 2682 2683 /** 2684 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2685 * 2686 * Returns zero on success 2687 * 2688 * @rbname: name of the RAMBlock of the request. NULL means the 2689 * same that last one. 2690 * @start: RAMBlock starting page 2691 * @length: RAMBlock size 2692 */ 2693 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2694 { 2695 trace_ram_discard_range(rbname, start, length); 2696 2697 RCU_READ_LOCK_GUARD(); 2698 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2699 2700 if (!rb) { 2701 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2702 return -1; 2703 } 2704 2705 /* 2706 * On source VM, we don't need to update the received bitmap since 2707 * we don't even have one. 2708 */ 2709 if (rb->receivedmap) { 2710 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2711 length >> qemu_target_page_bits()); 2712 } 2713 2714 return ram_block_discard_range(rb, start, length); 2715 } 2716 2717 /* 2718 * For every allocation, we will try not to crash the VM if the 2719 * allocation failed. 
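 *
 * That is why the g_try_*() allocators are used below: unlike g_malloc0(),
 * g_try_malloc0() returns NULL on failure instead of aborting the process,
 * so the caller can fail gracefully, e.g. (sketch):
 *
 *   buf = g_try_malloc0(TARGET_PAGE_SIZE);
 *   if (!buf) {
 *       return -ENOMEM;   // report the failure instead of crashing
 *   }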
2720 */ 2721 static int xbzrle_init(void) 2722 { 2723 Error *local_err = NULL; 2724 2725 if (!migrate_use_xbzrle()) { 2726 return 0; 2727 } 2728 2729 XBZRLE_cache_lock(); 2730 2731 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2732 if (!XBZRLE.zero_target_page) { 2733 error_report("%s: Error allocating zero page", __func__); 2734 goto err_out; 2735 } 2736 2737 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2738 TARGET_PAGE_SIZE, &local_err); 2739 if (!XBZRLE.cache) { 2740 error_report_err(local_err); 2741 goto free_zero_page; 2742 } 2743 2744 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2745 if (!XBZRLE.encoded_buf) { 2746 error_report("%s: Error allocating encoded_buf", __func__); 2747 goto free_cache; 2748 } 2749 2750 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2751 if (!XBZRLE.current_buf) { 2752 error_report("%s: Error allocating current_buf", __func__); 2753 goto free_encoded_buf; 2754 } 2755 2756 /* We are all good */ 2757 XBZRLE_cache_unlock(); 2758 return 0; 2759 2760 free_encoded_buf: 2761 g_free(XBZRLE.encoded_buf); 2762 XBZRLE.encoded_buf = NULL; 2763 free_cache: 2764 cache_fini(XBZRLE.cache); 2765 XBZRLE.cache = NULL; 2766 free_zero_page: 2767 g_free(XBZRLE.zero_target_page); 2768 XBZRLE.zero_target_page = NULL; 2769 err_out: 2770 XBZRLE_cache_unlock(); 2771 return -ENOMEM; 2772 } 2773 2774 static int ram_state_init(RAMState **rsp) 2775 { 2776 *rsp = g_try_new0(RAMState, 1); 2777 2778 if (!*rsp) { 2779 error_report("%s: Init ramstate fail", __func__); 2780 return -1; 2781 } 2782 2783 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2784 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2785 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2786 2787 /* 2788 * Count the total number of pages used by ram blocks not including any 2789 * gaps due to alignment or unplugs. 2790 * This must match with the initial values of dirty bitmap. 2791 */ 2792 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2793 ram_state_reset(*rsp); 2794 2795 return 0; 2796 } 2797 2798 static void ram_list_init_bitmaps(void) 2799 { 2800 MigrationState *ms = migrate_get_current(); 2801 RAMBlock *block; 2802 unsigned long pages; 2803 uint8_t shift; 2804 2805 /* Skip setting bitmap if there is no RAM */ 2806 if (ram_bytes_total()) { 2807 shift = ms->clear_bitmap_shift; 2808 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2809 error_report("clear_bitmap_shift (%u) too big, using " 2810 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2811 shift = CLEAR_BITMAP_SHIFT_MAX; 2812 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2813 error_report("clear_bitmap_shift (%u) too small, using " 2814 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2815 shift = CLEAR_BITMAP_SHIFT_MIN; 2816 } 2817 2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2819 pages = block->max_length >> TARGET_PAGE_BITS; 2820 /* 2821 * The initial dirty bitmap for migration must be set with all 2822 * ones to make sure we'll migrate every guest RAM page to 2823 * destination. 2824 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2825 * new migration after a failed migration, ram_list. 2826 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2827 * guest memory. 
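 *
 * The clear_bmap allocated below is deliberately coarser than bmap: each
 * of its bits covers 1 << clear_bmap_shift target pages, so its size is
 * roughly (sketch, assuming clear_bmap_size() rounds up):
 *
 *   clear_bmap bits = DIV_ROUND_UP(pages, 1UL << shift);
 *
 * e.g. with 4 KiB target pages and shift == 18, one bit stands for about
 * 1 GiB of guest RAM.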
2828 */ 2829 block->bmap = bitmap_new(pages); 2830 bitmap_set(block->bmap, 0, pages); 2831 block->clear_bmap_shift = shift; 2832 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2833 } 2834 } 2835 } 2836 2837 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2838 { 2839 unsigned long pages; 2840 RAMBlock *rb; 2841 2842 RCU_READ_LOCK_GUARD(); 2843 2844 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2845 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2846 rs->migration_dirty_pages -= pages; 2847 } 2848 } 2849 2850 static void ram_init_bitmaps(RAMState *rs) 2851 { 2852 /* For memory_global_dirty_log_start below. */ 2853 qemu_mutex_lock_iothread(); 2854 qemu_mutex_lock_ramlist(); 2855 2856 WITH_RCU_READ_LOCK_GUARD() { 2857 ram_list_init_bitmaps(); 2858 /* We don't use dirty log with background snapshots */ 2859 if (!migrate_background_snapshot()) { 2860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2861 migration_bitmap_sync_precopy(rs); 2862 } 2863 } 2864 qemu_mutex_unlock_ramlist(); 2865 qemu_mutex_unlock_iothread(); 2866 2867 /* 2868 * After an eventual first bitmap sync, fixup the initial bitmap 2869 * containing all 1s to exclude any discarded pages from migration. 2870 */ 2871 migration_bitmap_clear_discarded_pages(rs); 2872 } 2873 2874 static int ram_init_all(RAMState **rsp) 2875 { 2876 if (ram_state_init(rsp)) { 2877 return -1; 2878 } 2879 2880 if (xbzrle_init()) { 2881 ram_state_cleanup(rsp); 2882 return -1; 2883 } 2884 2885 ram_init_bitmaps(*rsp); 2886 2887 return 0; 2888 } 2889 2890 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2891 { 2892 RAMBlock *block; 2893 uint64_t pages = 0; 2894 2895 /* 2896 * Postcopy is not using xbzrle/compression, so no need for that. 2897 * Also, since source are already halted, we don't need to care 2898 * about dirty page logging as well. 2899 */ 2900 2901 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2902 pages += bitmap_count_one(block->bmap, 2903 block->used_length >> TARGET_PAGE_BITS); 2904 } 2905 2906 /* This may not be aligned with current bitmaps. Recalculate. */ 2907 rs->migration_dirty_pages = pages; 2908 2909 ram_state_reset(rs); 2910 2911 /* Update RAMState cache of output QEMUFile */ 2912 rs->f = out; 2913 2914 trace_ram_state_resume_prepare(pages); 2915 } 2916 2917 /* 2918 * This function clears bits of the free pages reported by the caller from the 2919 * migration dirty bitmap. @addr is the host address corresponding to the 2920 * start of the continuous guest free pages, and @len is the total bytes of 2921 * those pages. 2922 */ 2923 void qemu_guest_free_page_hint(void *addr, size_t len) 2924 { 2925 RAMBlock *block; 2926 ram_addr_t offset; 2927 size_t used_len, start, npages; 2928 MigrationState *s = migrate_get_current(); 2929 2930 /* This function is currently expected to be used during live migration */ 2931 if (!migration_is_setup_or_active(s->state)) { 2932 return; 2933 } 2934 2935 for (; len > 0; len -= used_len, addr += used_len) { 2936 block = qemu_ram_block_from_host(addr, false, &offset); 2937 if (unlikely(!block || offset >= block->used_length)) { 2938 /* 2939 * The implementation might not support RAMBlock resize during 2940 * live migration, but it could happen in theory with future 2941 * updates. So we add a check here to capture that case. 
2942 */ 2943 error_report_once("%s unexpected error", __func__); 2944 return; 2945 } 2946 2947 if (len <= block->used_length - offset) { 2948 used_len = len; 2949 } else { 2950 used_len = block->used_length - offset; 2951 } 2952 2953 start = offset >> TARGET_PAGE_BITS; 2954 npages = used_len >> TARGET_PAGE_BITS; 2955 2956 qemu_mutex_lock(&ram_state->bitmap_mutex); 2957 /* 2958 * The skipped free pages are equivalent to having been sent from clear_bmap's 2959 * perspective, so clear the bits from the memory region bitmap which 2960 * are initially set. Otherwise those skipped pages will be sent in 2961 * the next round after syncing from the memory region bitmap. 2962 */ 2963 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2964 ram_state->migration_dirty_pages -= 2965 bitmap_count_one_with_offset(block->bmap, start, npages); 2966 bitmap_clear(block->bmap, start, npages); 2967 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2968 } 2969 } 2970 2971 /* 2972 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a 2973 * long-running RCU critical section. When rcu-reclaims in the code 2974 * start to become numerous it will be necessary to reduce the 2975 * granularity of these critical sections. 2976 */ 2977 2978 /** 2979 * ram_save_setup: Setup RAM for migration 2980 * 2981 * Returns zero to indicate success and negative for error 2982 * 2983 * @f: QEMUFile where to send the data 2984 * @opaque: RAMState pointer 2985 */ 2986 static int ram_save_setup(QEMUFile *f, void *opaque) 2987 { 2988 RAMState **rsp = opaque; 2989 RAMBlock *block; 2990 2991 if (compress_threads_save_setup()) { 2992 return -1; 2993 } 2994 2995 /* migration has already set up the bitmap, reuse it. */ 2996 if (!migration_in_colo_state()) { 2997 if (ram_init_all(rsp) != 0) { 2998 compress_threads_save_cleanup(); 2999 return -1; 3000 } 3001 } 3002 (*rsp)->f = f; 3003 3004 WITH_RCU_READ_LOCK_GUARD() { 3005 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3006 3007 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3008 qemu_put_byte(f, strlen(block->idstr)); 3009 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3010 qemu_put_be64(f, block->used_length); 3011 if (migrate_postcopy_ram() && block->page_size != 3012 qemu_host_page_size) { 3013 qemu_put_be64(f, block->page_size); 3014 } 3015 if (migrate_ignore_shared()) { 3016 qemu_put_be64(f, block->mr->addr); 3017 } 3018 } 3019 } 3020 3021 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3022 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3023 3024 multifd_send_sync_main(f); 3025 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3026 qemu_fflush(f); 3027 3028 return 0; 3029 } 3030 3031 /** 3032 * ram_save_iterate: iterative stage for migration 3033 * 3034 * Returns zero to indicate success and negative for error 3035 * 3036 * @f: QEMUFile where to send the data 3037 * @opaque: RAMState pointer 3038 */ 3039 static int ram_save_iterate(QEMUFile *f, void *opaque) 3040 { 3041 RAMState **temp = opaque; 3042 RAMState *rs = *temp; 3043 int ret = 0; 3044 int i; 3045 int64_t t0; 3046 int done = 0; 3047 3048 if (blk_mig_bulk_active()) { 3049 /* Avoid transferring ram during the bulk phase of block migration as 3050 * the bulk phase will usually take a long time and transferring 3051 * ram updates during that time is pointless. */ 3052 goto out; 3053 } 3054 3055 /* 3056 * We'll take this lock a little bit long, but it's okay for two reasons.
3057 * Firstly, the only other thread that could possibly take it is the one 3058 * that calls qemu_guest_free_page_hint(), which should be rare; secondly, see 3059 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3060 * guarantees that we'll at least release it on a regular basis. 3061 */ 3062 qemu_mutex_lock(&rs->bitmap_mutex); 3063 WITH_RCU_READ_LOCK_GUARD() { 3064 if (ram_list.version != rs->last_version) { 3065 ram_state_reset(rs); 3066 } 3067 3068 /* Read version before ram_list.blocks */ 3069 smp_rmb(); 3070 3071 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3072 3073 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3074 i = 0; 3075 while ((ret = qemu_file_rate_limit(f)) == 0 || 3076 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 3077 int pages; 3078 3079 if (qemu_file_get_error(f)) { 3080 break; 3081 } 3082 3083 pages = ram_find_and_save_block(rs, false); 3084 /* no more pages to send */ 3085 if (pages == 0) { 3086 done = 1; 3087 break; 3088 } 3089 3090 if (pages < 0) { 3091 qemu_file_set_error(f, pages); 3092 break; 3093 } 3094 3095 rs->target_page_count += pages; 3096 3097 /* 3098 * During postcopy, it is necessary to make sure one whole host 3099 * page is sent in one chunk. 3100 */ 3101 if (migrate_postcopy_ram()) { 3102 flush_compressed_data(rs); 3103 } 3104 3105 /* 3106 * We want to check in the 1st loop, just in case it was the 1st 3107 * time and we had to sync the dirty bitmap. 3108 * qemu_clock_get_ns() is a bit expensive, so we only check once 3109 * every few iterations 3110 */ 3111 if ((i & 63) == 0) { 3112 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3113 1000000; 3114 if (t1 > MAX_WAIT) { 3115 trace_ram_save_iterate_big_wait(t1, i); 3116 break; 3117 } 3118 } 3119 i++; 3120 } 3121 } 3122 qemu_mutex_unlock(&rs->bitmap_mutex); 3123 3124 /* 3125 * Must occur before EOS (or any QEMUFile operation) 3126 * because of RDMA protocol.
3127 */ 3128 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3129 3130 out: 3131 if (ret >= 0 3132 && migration_is_setup_or_active(migrate_get_current()->state)) { 3133 multifd_send_sync_main(rs->f); 3134 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3135 qemu_fflush(f); 3136 ram_counters.transferred += 8; 3137 3138 ret = qemu_file_get_error(f); 3139 } 3140 if (ret < 0) { 3141 return ret; 3142 } 3143 3144 return done; 3145 } 3146 3147 /** 3148 * ram_save_complete: function called to send the remaining amount of ram 3149 * 3150 * Returns zero to indicate success or negative on error 3151 * 3152 * Called with iothread lock 3153 * 3154 * @f: QEMUFile where to send the data 3155 * @opaque: RAMState pointer 3156 */ 3157 static int ram_save_complete(QEMUFile *f, void *opaque) 3158 { 3159 RAMState **temp = opaque; 3160 RAMState *rs = *temp; 3161 int ret = 0; 3162 3163 WITH_RCU_READ_LOCK_GUARD() { 3164 if (!migration_in_postcopy()) { 3165 migration_bitmap_sync_precopy(rs); 3166 } 3167 3168 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3169 3170 /* try transferring iterative blocks of memory */ 3171 3172 /* flush all remaining blocks regardless of rate limiting */ 3173 while (true) { 3174 int pages; 3175 3176 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 3177 /* no more blocks to sent */ 3178 if (pages == 0) { 3179 break; 3180 } 3181 if (pages < 0) { 3182 ret = pages; 3183 break; 3184 } 3185 } 3186 3187 flush_compressed_data(rs); 3188 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3189 } 3190 3191 if (ret >= 0) { 3192 multifd_send_sync_main(rs->f); 3193 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3194 qemu_fflush(f); 3195 } 3196 3197 return ret; 3198 } 3199 3200 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3201 uint64_t *res_precopy_only, 3202 uint64_t *res_compatible, 3203 uint64_t *res_postcopy_only) 3204 { 3205 RAMState **temp = opaque; 3206 RAMState *rs = *temp; 3207 uint64_t remaining_size; 3208 3209 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3210 3211 if (!migration_in_postcopy() && 3212 remaining_size < max_size) { 3213 qemu_mutex_lock_iothread(); 3214 WITH_RCU_READ_LOCK_GUARD() { 3215 migration_bitmap_sync_precopy(rs); 3216 } 3217 qemu_mutex_unlock_iothread(); 3218 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3219 } 3220 3221 if (migrate_postcopy_ram()) { 3222 /* We can do postcopy, and all the data is postcopiable */ 3223 *res_compatible += remaining_size; 3224 } else { 3225 *res_precopy_only += remaining_size; 3226 } 3227 } 3228 3229 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3230 { 3231 unsigned int xh_len; 3232 int xh_flags; 3233 uint8_t *loaded_data; 3234 3235 /* extract RLE header */ 3236 xh_flags = qemu_get_byte(f); 3237 xh_len = qemu_get_be16(f); 3238 3239 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3240 error_report("Failed to load XBZRLE page - wrong compression!"); 3241 return -1; 3242 } 3243 3244 if (xh_len > TARGET_PAGE_SIZE) { 3245 error_report("Failed to load XBZRLE page - len overflow!"); 3246 return -1; 3247 } 3248 loaded_data = XBZRLE.decoded_buf; 3249 /* load data and decode */ 3250 /* it can change loaded_data to point to an internal buffer */ 3251 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3252 3253 /* decode RLE */ 3254 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3255 TARGET_PAGE_SIZE) == -1) { 3256 error_report("Failed to load XBZRLE page - decode error!"); 3257 return -1; 3258 } 3259 3260 return 0; 3261 } 3262 3263 /** 3264 * ram_block_from_stream: read a 
RAMBlock id from the migration stream 3265 * 3266 * Must be called from within a rcu critical section. 3267 * 3268 * Returns a pointer from within the RCU-protected ram_list. 3269 * 3270 * @f: QEMUFile where to read the data from 3271 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3272 */ 3273 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3274 { 3275 static RAMBlock *block; 3276 char id[256]; 3277 uint8_t len; 3278 3279 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3280 if (!block) { 3281 error_report("Ack, bad migration stream!"); 3282 return NULL; 3283 } 3284 return block; 3285 } 3286 3287 len = qemu_get_byte(f); 3288 qemu_get_buffer(f, (uint8_t *)id, len); 3289 id[len] = 0; 3290 3291 block = qemu_ram_block_by_name(id); 3292 if (!block) { 3293 error_report("Can't find block %s", id); 3294 return NULL; 3295 } 3296 3297 if (ramblock_is_ignored(block)) { 3298 error_report("block %s should not be migrated !", id); 3299 return NULL; 3300 } 3301 3302 return block; 3303 } 3304 3305 static inline void *host_from_ram_block_offset(RAMBlock *block, 3306 ram_addr_t offset) 3307 { 3308 if (!offset_in_ramblock(block, offset)) { 3309 return NULL; 3310 } 3311 3312 return block->host + offset; 3313 } 3314 3315 static void *host_page_from_ram_block_offset(RAMBlock *block, 3316 ram_addr_t offset) 3317 { 3318 /* Note: Explicitly no check against offset_in_ramblock(). */ 3319 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3320 block->page_size); 3321 } 3322 3323 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3324 ram_addr_t offset) 3325 { 3326 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3327 } 3328 3329 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3330 ram_addr_t offset, bool record_bitmap) 3331 { 3332 if (!offset_in_ramblock(block, offset)) { 3333 return NULL; 3334 } 3335 if (!block->colo_cache) { 3336 error_report("%s: colo_cache is NULL in block :%s", 3337 __func__, block->idstr); 3338 return NULL; 3339 } 3340 3341 /* 3342 * During colo checkpoint, we need bitmap of these migrated pages. 3343 * It help us to decide which pages in ram cache should be flushed 3344 * into VM's RAM later. 3345 */ 3346 if (record_bitmap && 3347 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3348 ram_state->migration_dirty_pages++; 3349 } 3350 return block->colo_cache + offset; 3351 } 3352 3353 /** 3354 * ram_handle_compressed: handle the zero page case 3355 * 3356 * If a page (or a whole RDMA chunk) has been 3357 * determined to be zero, then zap it. 3358 * 3359 * @host: host address for the zero page 3360 * @ch: what the page is filled from. 
We only support zero 3361 * @size: size of the zero page 3362 */ 3363 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3364 { 3365 if (ch != 0 || !buffer_is_zero(host, size)) { 3366 memset(host, ch, size); 3367 } 3368 } 3369 3370 /* return the size after decompression, or negative value on error */ 3371 static int 3372 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3373 const uint8_t *source, size_t source_len) 3374 { 3375 int err; 3376 3377 err = inflateReset(stream); 3378 if (err != Z_OK) { 3379 return -1; 3380 } 3381 3382 stream->avail_in = source_len; 3383 stream->next_in = (uint8_t *)source; 3384 stream->avail_out = dest_len; 3385 stream->next_out = dest; 3386 3387 err = inflate(stream, Z_NO_FLUSH); 3388 if (err != Z_STREAM_END) { 3389 return -1; 3390 } 3391 3392 return stream->total_out; 3393 } 3394 3395 static void *do_data_decompress(void *opaque) 3396 { 3397 DecompressParam *param = opaque; 3398 unsigned long pagesize; 3399 uint8_t *des; 3400 int len, ret; 3401 3402 qemu_mutex_lock(¶m->mutex); 3403 while (!param->quit) { 3404 if (param->des) { 3405 des = param->des; 3406 len = param->len; 3407 param->des = 0; 3408 qemu_mutex_unlock(¶m->mutex); 3409 3410 pagesize = TARGET_PAGE_SIZE; 3411 3412 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3413 param->compbuf, len); 3414 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3415 error_report("decompress data failed"); 3416 qemu_file_set_error(decomp_file, ret); 3417 } 3418 3419 qemu_mutex_lock(&decomp_done_lock); 3420 param->done = true; 3421 qemu_cond_signal(&decomp_done_cond); 3422 qemu_mutex_unlock(&decomp_done_lock); 3423 3424 qemu_mutex_lock(¶m->mutex); 3425 } else { 3426 qemu_cond_wait(¶m->cond, ¶m->mutex); 3427 } 3428 } 3429 qemu_mutex_unlock(¶m->mutex); 3430 3431 return NULL; 3432 } 3433 3434 static int wait_for_decompress_done(void) 3435 { 3436 int idx, thread_count; 3437 3438 if (!migrate_use_compression()) { 3439 return 0; 3440 } 3441 3442 thread_count = migrate_decompress_threads(); 3443 qemu_mutex_lock(&decomp_done_lock); 3444 for (idx = 0; idx < thread_count; idx++) { 3445 while (!decomp_param[idx].done) { 3446 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3447 } 3448 } 3449 qemu_mutex_unlock(&decomp_done_lock); 3450 return qemu_file_get_error(decomp_file); 3451 } 3452 3453 static void compress_threads_load_cleanup(void) 3454 { 3455 int i, thread_count; 3456 3457 if (!migrate_use_compression()) { 3458 return; 3459 } 3460 thread_count = migrate_decompress_threads(); 3461 for (i = 0; i < thread_count; i++) { 3462 /* 3463 * we use it as a indicator which shows if the thread is 3464 * properly init'd or not 3465 */ 3466 if (!decomp_param[i].compbuf) { 3467 break; 3468 } 3469 3470 qemu_mutex_lock(&decomp_param[i].mutex); 3471 decomp_param[i].quit = true; 3472 qemu_cond_signal(&decomp_param[i].cond); 3473 qemu_mutex_unlock(&decomp_param[i].mutex); 3474 } 3475 for (i = 0; i < thread_count; i++) { 3476 if (!decomp_param[i].compbuf) { 3477 break; 3478 } 3479 3480 qemu_thread_join(decompress_threads + i); 3481 qemu_mutex_destroy(&decomp_param[i].mutex); 3482 qemu_cond_destroy(&decomp_param[i].cond); 3483 inflateEnd(&decomp_param[i].stream); 3484 g_free(decomp_param[i].compbuf); 3485 decomp_param[i].compbuf = NULL; 3486 } 3487 g_free(decompress_threads); 3488 g_free(decomp_param); 3489 decompress_threads = NULL; 3490 decomp_param = NULL; 3491 decomp_file = NULL; 3492 } 3493 3494 static int compress_threads_load_setup(QEMUFile *f) 3495 { 3496 int i, thread_count; 
3497 3498 if (!migrate_use_compression()) { 3499 return 0; 3500 } 3501 3502 thread_count = migrate_decompress_threads(); 3503 decompress_threads = g_new0(QemuThread, thread_count); 3504 decomp_param = g_new0(DecompressParam, thread_count); 3505 qemu_mutex_init(&decomp_done_lock); 3506 qemu_cond_init(&decomp_done_cond); 3507 decomp_file = f; 3508 for (i = 0; i < thread_count; i++) { 3509 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3510 goto exit; 3511 } 3512 3513 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3514 qemu_mutex_init(&decomp_param[i].mutex); 3515 qemu_cond_init(&decomp_param[i].cond); 3516 decomp_param[i].done = true; 3517 decomp_param[i].quit = false; 3518 qemu_thread_create(decompress_threads + i, "decompress", 3519 do_data_decompress, decomp_param + i, 3520 QEMU_THREAD_JOINABLE); 3521 } 3522 return 0; 3523 exit: 3524 compress_threads_load_cleanup(); 3525 return -1; 3526 } 3527 3528 static void decompress_data_with_multi_threads(QEMUFile *f, 3529 void *host, int len) 3530 { 3531 int idx, thread_count; 3532 3533 thread_count = migrate_decompress_threads(); 3534 QEMU_LOCK_GUARD(&decomp_done_lock); 3535 while (true) { 3536 for (idx = 0; idx < thread_count; idx++) { 3537 if (decomp_param[idx].done) { 3538 decomp_param[idx].done = false; 3539 qemu_mutex_lock(&decomp_param[idx].mutex); 3540 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3541 decomp_param[idx].des = host; 3542 decomp_param[idx].len = len; 3543 qemu_cond_signal(&decomp_param[idx].cond); 3544 qemu_mutex_unlock(&decomp_param[idx].mutex); 3545 break; 3546 } 3547 } 3548 if (idx < thread_count) { 3549 break; 3550 } else { 3551 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3552 } 3553 } 3554 } 3555 3556 static void colo_init_ram_state(void) 3557 { 3558 ram_state_init(&ram_state); 3559 } 3560 3561 /* 3562 * colo cache: this is for secondary VM, we cache the whole 3563 * memory of the secondary VM, it is need to hold the global lock 3564 * to call this helper. 3565 */ 3566 int colo_init_ram_cache(void) 3567 { 3568 RAMBlock *block; 3569 3570 WITH_RCU_READ_LOCK_GUARD() { 3571 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3572 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3573 NULL, false, false); 3574 if (!block->colo_cache) { 3575 error_report("%s: Can't alloc memory for COLO cache of block %s," 3576 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3577 block->used_length); 3578 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3579 if (block->colo_cache) { 3580 qemu_anon_ram_free(block->colo_cache, block->used_length); 3581 block->colo_cache = NULL; 3582 } 3583 } 3584 return -errno; 3585 } 3586 if (!machine_dump_guest_core(current_machine)) { 3587 qemu_madvise(block->colo_cache, block->used_length, 3588 QEMU_MADV_DONTDUMP); 3589 } 3590 } 3591 } 3592 3593 /* 3594 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3595 * with to decide which page in cache should be flushed into SVM's RAM. Here 3596 * we use the same name 'ram_bitmap' as for migration. 3597 */ 3598 if (ram_bytes_total()) { 3599 RAMBlock *block; 3600 3601 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3602 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3603 block->bmap = bitmap_new(pages); 3604 } 3605 } 3606 3607 colo_init_ram_state(); 3608 return 0; 3609 } 3610 3611 /* TODO: duplicated with ram_init_bitmaps */ 3612 void colo_incoming_start_dirty_log(void) 3613 { 3614 RAMBlock *block = NULL; 3615 /* For memory_global_dirty_log_start below. 
*/ 3616 qemu_mutex_lock_iothread(); 3617 qemu_mutex_lock_ramlist(); 3618 3619 memory_global_dirty_log_sync(); 3620 WITH_RCU_READ_LOCK_GUARD() { 3621 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3622 ramblock_sync_dirty_bitmap(ram_state, block); 3623 /* Discard this dirty bitmap record */ 3624 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3625 } 3626 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3627 } 3628 ram_state->migration_dirty_pages = 0; 3629 qemu_mutex_unlock_ramlist(); 3630 qemu_mutex_unlock_iothread(); 3631 } 3632 3633 /* It is need to hold the global lock to call this helper */ 3634 void colo_release_ram_cache(void) 3635 { 3636 RAMBlock *block; 3637 3638 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3639 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3640 g_free(block->bmap); 3641 block->bmap = NULL; 3642 } 3643 3644 WITH_RCU_READ_LOCK_GUARD() { 3645 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3646 if (block->colo_cache) { 3647 qemu_anon_ram_free(block->colo_cache, block->used_length); 3648 block->colo_cache = NULL; 3649 } 3650 } 3651 } 3652 ram_state_cleanup(&ram_state); 3653 } 3654 3655 /** 3656 * ram_load_setup: Setup RAM for migration incoming side 3657 * 3658 * Returns zero to indicate success and negative for error 3659 * 3660 * @f: QEMUFile where to receive the data 3661 * @opaque: RAMState pointer 3662 */ 3663 static int ram_load_setup(QEMUFile *f, void *opaque) 3664 { 3665 if (compress_threads_load_setup(f)) { 3666 return -1; 3667 } 3668 3669 xbzrle_load_setup(); 3670 ramblock_recv_map_init(); 3671 3672 return 0; 3673 } 3674 3675 static int ram_load_cleanup(void *opaque) 3676 { 3677 RAMBlock *rb; 3678 3679 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3680 qemu_ram_block_writeback(rb); 3681 } 3682 3683 xbzrle_load_cleanup(); 3684 compress_threads_load_cleanup(); 3685 3686 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3687 g_free(rb->receivedmap); 3688 rb->receivedmap = NULL; 3689 } 3690 3691 return 0; 3692 } 3693 3694 /** 3695 * ram_postcopy_incoming_init: allocate postcopy data structures 3696 * 3697 * Returns 0 for success and negative if there was one error 3698 * 3699 * @mis: current migration incoming state 3700 * 3701 * Allocate data structures etc needed by incoming migration with 3702 * postcopy-ram. postcopy-ram's similarly names 3703 * postcopy_ram_incoming_init does the work. 3704 */ 3705 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3706 { 3707 return postcopy_ram_incoming_init(mis); 3708 } 3709 3710 /** 3711 * ram_load_postcopy: load a page in postcopy case 3712 * 3713 * Returns 0 for success or -errno in case of error 3714 * 3715 * Called in postcopy mode by ram_load(). 3716 * rcu_read_lock is taken prior to this being called. 
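 *
 * Worked example of the host-page assembly done here (sizes assumed for
 * illustration): with a 2 MiB hugetlbfs RAMBlock and 4 KiB target pages,
 * block->page_size / TARGET_PAGE_SIZE == 512, so 512 incoming target-page
 * chunks are accumulated in mis->postcopy_tmp_page and only the 512th one
 * triggers postcopy_place_page()/postcopy_place_page_zero() for the whole
 * host page.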
3717 * 3718 * @f: QEMUFile where to send the data 3719 */ 3720 static int ram_load_postcopy(QEMUFile *f) 3721 { 3722 int flags = 0, ret = 0; 3723 bool place_needed = false; 3724 bool matches_target_page_size = false; 3725 MigrationIncomingState *mis = migration_incoming_get_current(); 3726 /* Temporary page that is later 'placed' */ 3727 void *postcopy_host_page = mis->postcopy_tmp_page; 3728 void *host_page = NULL; 3729 bool all_zero = true; 3730 int target_pages = 0; 3731 3732 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3733 ram_addr_t addr; 3734 void *page_buffer = NULL; 3735 void *place_source = NULL; 3736 RAMBlock *block = NULL; 3737 uint8_t ch; 3738 int len; 3739 3740 addr = qemu_get_be64(f); 3741 3742 /* 3743 * If qemu file error, we should stop here, and then "addr" 3744 * may be invalid 3745 */ 3746 ret = qemu_file_get_error(f); 3747 if (ret) { 3748 break; 3749 } 3750 3751 flags = addr & ~TARGET_PAGE_MASK; 3752 addr &= TARGET_PAGE_MASK; 3753 3754 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3755 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3756 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3757 block = ram_block_from_stream(f, flags); 3758 if (!block) { 3759 ret = -EINVAL; 3760 break; 3761 } 3762 3763 /* 3764 * Relying on used_length is racy and can result in false positives. 3765 * We might place pages beyond used_length in case RAM was shrunk 3766 * while in postcopy, which is fine - trying to place via 3767 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3768 */ 3769 if (!block->host || addr >= block->postcopy_length) { 3770 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3771 ret = -EINVAL; 3772 break; 3773 } 3774 target_pages++; 3775 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3776 /* 3777 * Postcopy requires that we place whole host pages atomically; 3778 * these may be huge pages for RAMBlocks that are backed by 3779 * hugetlbfs. 3780 * To make it atomic, the data is read into a temporary page 3781 * that's moved into place later. 3782 * The migration protocol uses, possibly smaller, target-pages 3783 * however the source ensures it always sends all the components 3784 * of a host page in one chunk. 3785 */ 3786 page_buffer = postcopy_host_page + 3787 host_page_offset_from_ram_block_offset(block, addr); 3788 /* If all TP are zero then we can optimise the place */ 3789 if (target_pages == 1) { 3790 host_page = host_page_from_ram_block_offset(block, addr); 3791 } else if (host_page != host_page_from_ram_block_offset(block, 3792 addr)) { 3793 /* not the 1st TP within the HP */ 3794 error_report("Non-same host page %p/%p", host_page, 3795 host_page_from_ram_block_offset(block, addr)); 3796 ret = -EINVAL; 3797 break; 3798 } 3799 3800 /* 3801 * If it's the last part of a host page then we place the host 3802 * page 3803 */ 3804 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3805 place_needed = true; 3806 } 3807 place_source = postcopy_host_page; 3808 } 3809 3810 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3811 case RAM_SAVE_FLAG_ZERO: 3812 ch = qemu_get_byte(f); 3813 /* 3814 * Can skip to set page_buffer when 3815 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3816 */ 3817 if (ch || !matches_target_page_size) { 3818 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3819 } 3820 if (ch) { 3821 all_zero = false; 3822 } 3823 break; 3824 3825 case RAM_SAVE_FLAG_PAGE: 3826 all_zero = false; 3827 if (!matches_target_page_size) { 3828 /* For huge pages, we always use temporary buffer */ 3829 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3830 } else { 3831 /* 3832 * For small pages that matches target page size, we 3833 * avoid the qemu_file copy. Instead we directly use 3834 * the buffer of QEMUFile to place the page. Note: we 3835 * cannot do any QEMUFile operation before using that 3836 * buffer to make sure the buffer is valid when 3837 * placing the page. 3838 */ 3839 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3840 TARGET_PAGE_SIZE); 3841 } 3842 break; 3843 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3844 all_zero = false; 3845 len = qemu_get_be32(f); 3846 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3847 error_report("Invalid compressed data length: %d", len); 3848 ret = -EINVAL; 3849 break; 3850 } 3851 decompress_data_with_multi_threads(f, page_buffer, len); 3852 break; 3853 3854 case RAM_SAVE_FLAG_EOS: 3855 /* normal exit */ 3856 multifd_recv_sync_main(); 3857 break; 3858 default: 3859 error_report("Unknown combination of migration flags: 0x%x" 3860 " (postcopy mode)", flags); 3861 ret = -EINVAL; 3862 break; 3863 } 3864 3865 /* Got the whole host page, wait for decompress before placing. */ 3866 if (place_needed) { 3867 ret |= wait_for_decompress_done(); 3868 } 3869 3870 /* Detect for any possible file errors */ 3871 if (!ret && qemu_file_get_error(f)) { 3872 ret = qemu_file_get_error(f); 3873 } 3874 3875 if (!ret && place_needed) { 3876 if (all_zero) { 3877 ret = postcopy_place_page_zero(mis, host_page, block); 3878 } else { 3879 ret = postcopy_place_page(mis, host_page, place_source, 3880 block); 3881 } 3882 place_needed = false; 3883 target_pages = 0; 3884 /* Assume we have a zero page until we detect something different */ 3885 all_zero = true; 3886 } 3887 } 3888 3889 return ret; 3890 } 3891 3892 static bool postcopy_is_advised(void) 3893 { 3894 PostcopyState ps = postcopy_state_get(); 3895 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3896 } 3897 3898 static bool postcopy_is_running(void) 3899 { 3900 PostcopyState ps = postcopy_state_get(); 3901 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3902 } 3903 3904 /* 3905 * Flush content of RAM cache into SVM's memory. 3906 * Only flush the pages that be dirtied by PVM or SVM or both. 
3907 */ 3908 void colo_flush_ram_cache(void) 3909 { 3910 RAMBlock *block = NULL; 3911 void *dst_host; 3912 void *src_host; 3913 unsigned long offset = 0; 3914 3915 memory_global_dirty_log_sync(); 3916 WITH_RCU_READ_LOCK_GUARD() { 3917 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3918 ramblock_sync_dirty_bitmap(ram_state, block); 3919 } 3920 } 3921 3922 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3923 WITH_RCU_READ_LOCK_GUARD() { 3924 block = QLIST_FIRST_RCU(&ram_list.blocks); 3925 3926 while (block) { 3927 unsigned long num = 0; 3928 3929 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3930 if (!offset_in_ramblock(block, 3931 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3932 offset = 0; 3933 num = 0; 3934 block = QLIST_NEXT_RCU(block, next); 3935 } else { 3936 unsigned long i = 0; 3937 3938 for (i = 0; i < num; i++) { 3939 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3940 } 3941 dst_host = block->host 3942 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3943 src_host = block->colo_cache 3944 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3945 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3946 offset += num; 3947 } 3948 } 3949 } 3950 trace_colo_flush_ram_cache_end(); 3951 } 3952 3953 /** 3954 * ram_load_precopy: load pages in precopy case 3955 * 3956 * Returns 0 for success or -errno in case of error 3957 * 3958 * Called in precopy mode by ram_load(). 3959 * rcu_read_lock is taken prior to this being called. 3960 * 3961 * @f: QEMUFile to read the data from 3962 */ 3963 static int ram_load_precopy(QEMUFile *f) 3964 { 3965 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3966 /* ADVISE is earlier; it shows that the source has the postcopy capability on */ 3967 bool postcopy_advised = postcopy_is_advised(); 3968 if (!migrate_use_compression()) { 3969 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3970 } 3971 3972 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3973 ram_addr_t addr, total_ram_bytes; 3974 void *host = NULL, *host_bak = NULL; 3975 uint8_t ch; 3976 3977 /* 3978 * Yield periodically to let the main loop run, but an iteration of 3979 * the main loop is expensive, so only do it every so many iterations. 3980 */ 3981 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3982 aio_co_schedule(qemu_get_current_aio_context(), 3983 qemu_coroutine_self()); 3984 qemu_coroutine_yield(); 3985 } 3986 i++; 3987 3988 addr = qemu_get_be64(f); 3989 flags = addr & ~TARGET_PAGE_MASK; 3990 addr &= TARGET_PAGE_MASK; 3991 3992 if (flags & invalid_flags) { 3993 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3994 error_report("Received an unexpected compressed page"); 3995 } 3996 3997 ret = -EINVAL; 3998 break; 3999 } 4000 4001 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4002 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4003 RAMBlock *block = ram_block_from_stream(f, flags); 4004 4005 host = host_from_ram_block_offset(block, addr); 4006 /* 4007 * After going into COLO stage, we should not load the page 4008 * into SVM's memory directly; we put it into colo_cache first. 4009 * NOTE: We need to keep a copy of SVM's RAM in colo_cache. 4010 * Previously, we copied all of this memory in the COLO preparation 4011 * stage, during which the VM had to be stopped, which was time-consuming. 4012 * Here we optimize it by backing up every page during the migration 4013 * process while COLO is enabled. Although this slows the migration 4014 * down somewhat, it clearly reduces the downtime compared to backing 4015 * up all of the SVM's memory in the COLO preparation stage.
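 *
 * Summarizing the branches right below (as implemented here, not a
 * general COLO contract):
 *   COLO disabled:                host points at guest RAM, host_bak stays NULL
 *   COLO enabled, migration stage: host points at guest RAM, host_bak points
 *                                  at the colo_cache page and is filled by a
 *                                  memcpy() once the page has been loaded
 *   COLO enabled, COLO stage:      host points at the colo_cache page only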
4016 */ 4017 if (migration_incoming_colo_enabled()) { 4018 if (migration_incoming_in_colo_state()) { 4019 /* In COLO stage, put all pages into cache temporarily */ 4020 host = colo_cache_from_block_offset(block, addr, true); 4021 } else { 4022 /* 4023 * In migration stage but before COLO stage, 4024 * Put all pages into both cache and SVM's memory. 4025 */ 4026 host_bak = colo_cache_from_block_offset(block, addr, false); 4027 } 4028 } 4029 if (!host) { 4030 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4031 ret = -EINVAL; 4032 break; 4033 } 4034 if (!migration_incoming_in_colo_state()) { 4035 ramblock_recv_bitmap_set(block, host); 4036 } 4037 4038 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4039 } 4040 4041 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4042 case RAM_SAVE_FLAG_MEM_SIZE: 4043 /* Synchronize RAM block list */ 4044 total_ram_bytes = addr; 4045 while (!ret && total_ram_bytes) { 4046 RAMBlock *block; 4047 char id[256]; 4048 ram_addr_t length; 4049 4050 len = qemu_get_byte(f); 4051 qemu_get_buffer(f, (uint8_t *)id, len); 4052 id[len] = 0; 4053 length = qemu_get_be64(f); 4054 4055 block = qemu_ram_block_by_name(id); 4056 if (block && !qemu_ram_is_migratable(block)) { 4057 error_report("block %s should not be migrated !", id); 4058 ret = -EINVAL; 4059 } else if (block) { 4060 if (length != block->used_length) { 4061 Error *local_err = NULL; 4062 4063 ret = qemu_ram_resize(block, length, 4064 &local_err); 4065 if (local_err) { 4066 error_report_err(local_err); 4067 } 4068 } 4069 /* For postcopy we need to check hugepage sizes match */ 4070 if (postcopy_advised && migrate_postcopy_ram() && 4071 block->page_size != qemu_host_page_size) { 4072 uint64_t remote_page_size = qemu_get_be64(f); 4073 if (remote_page_size != block->page_size) { 4074 error_report("Mismatched RAM page size %s " 4075 "(local) %zd != %" PRId64, 4076 id, block->page_size, 4077 remote_page_size); 4078 ret = -EINVAL; 4079 } 4080 } 4081 if (migrate_ignore_shared()) { 4082 hwaddr addr = qemu_get_be64(f); 4083 if (ramblock_is_ignored(block) && 4084 block->mr->addr != addr) { 4085 error_report("Mismatched GPAs for block %s " 4086 "%" PRId64 "!= %" PRId64, 4087 id, (uint64_t)addr, 4088 (uint64_t)block->mr->addr); 4089 ret = -EINVAL; 4090 } 4091 } 4092 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 4093 block->idstr); 4094 } else { 4095 error_report("Unknown ramblock \"%s\", cannot " 4096 "accept migration", id); 4097 ret = -EINVAL; 4098 } 4099 4100 total_ram_bytes -= length; 4101 } 4102 break; 4103 4104 case RAM_SAVE_FLAG_ZERO: 4105 ch = qemu_get_byte(f); 4106 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 4107 break; 4108 4109 case RAM_SAVE_FLAG_PAGE: 4110 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4111 break; 4112 4113 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4114 len = qemu_get_be32(f); 4115 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4116 error_report("Invalid compressed data length: %d", len); 4117 ret = -EINVAL; 4118 break; 4119 } 4120 decompress_data_with_multi_threads(f, host, len); 4121 break; 4122 4123 case RAM_SAVE_FLAG_XBZRLE: 4124 if (load_xbzrle(f, addr, host) < 0) { 4125 error_report("Failed to decompress XBZRLE page at " 4126 RAM_ADDR_FMT, addr); 4127 ret = -EINVAL; 4128 break; 4129 } 4130 break; 4131 case RAM_SAVE_FLAG_EOS: 4132 /* normal exit */ 4133 multifd_recv_sync_main(); 4134 break; 4135 default: 4136 if (flags & RAM_SAVE_FLAG_HOOK) { 4137 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4138 } else { 4139 error_report("Unknown combination of migration flags: 
0x%x", 4140 flags); 4141 ret = -EINVAL; 4142 } 4143 } 4144 if (!ret) { 4145 ret = qemu_file_get_error(f); 4146 } 4147 if (!ret && host_bak) { 4148 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4149 } 4150 } 4151 4152 ret |= wait_for_decompress_done(); 4153 return ret; 4154 } 4155 4156 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4157 { 4158 int ret = 0; 4159 static uint64_t seq_iter; 4160 /* 4161 * If system is running in postcopy mode, page inserts to host memory must 4162 * be atomic 4163 */ 4164 bool postcopy_running = postcopy_is_running(); 4165 4166 seq_iter++; 4167 4168 if (version_id != 4) { 4169 return -EINVAL; 4170 } 4171 4172 /* 4173 * This RCU critical section can be very long running. 4174 * When RCU reclaims in the code start to become numerous, 4175 * it will be necessary to reduce the granularity of this 4176 * critical section. 4177 */ 4178 WITH_RCU_READ_LOCK_GUARD() { 4179 if (postcopy_running) { 4180 ret = ram_load_postcopy(f); 4181 } else { 4182 ret = ram_load_precopy(f); 4183 } 4184 } 4185 trace_ram_load_complete(ret, seq_iter); 4186 4187 return ret; 4188 } 4189 4190 static bool ram_has_postcopy(void *opaque) 4191 { 4192 RAMBlock *rb; 4193 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4194 if (ramblock_is_pmem(rb)) { 4195 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4196 "is not supported now!", rb->idstr, rb->host); 4197 return false; 4198 } 4199 } 4200 4201 return migrate_postcopy_ram(); 4202 } 4203 4204 /* Sync all the dirty bitmap with destination VM. */ 4205 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4206 { 4207 RAMBlock *block; 4208 QEMUFile *file = s->to_dst_file; 4209 int ramblock_count = 0; 4210 4211 trace_ram_dirty_bitmap_sync_start(); 4212 4213 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4214 qemu_savevm_send_recv_bitmap(file, block->idstr); 4215 trace_ram_dirty_bitmap_request(block->idstr); 4216 ramblock_count++; 4217 } 4218 4219 trace_ram_dirty_bitmap_sync_wait(); 4220 4221 /* Wait until all the ramblocks' dirty bitmap synced */ 4222 while (ramblock_count--) { 4223 qemu_sem_wait(&s->rp_state.rp_sem); 4224 } 4225 4226 trace_ram_dirty_bitmap_sync_complete(); 4227 4228 return 0; 4229 } 4230 4231 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4232 { 4233 qemu_sem_post(&s->rp_state.rp_sem); 4234 } 4235 4236 /* 4237 * Read the received bitmap, revert it as the initial dirty bitmap. 4238 * This is only used when the postcopy migration is paused but wants 4239 * to resume from a middle point. 4240 */ 4241 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4242 { 4243 int ret = -EINVAL; 4244 /* from_dst_file is always valid because we're within rp_thread */ 4245 QEMUFile *file = s->rp_state.from_dst_file; 4246 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4247 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4248 uint64_t size, end_mark; 4249 4250 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4251 4252 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4253 error_report("%s: incorrect state %s", __func__, 4254 MigrationStatus_str(s->state)); 4255 return -EINVAL; 4256 } 4257 4258 /* 4259 * Note: see comments in ramblock_recv_bitmap_send() on why we 4260 * need the endianness conversion, and the paddings. 
4261 */ 4262 local_size = ROUND_UP(local_size, 8); 4263 4264 /* Add paddings */ 4265 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4266 4267 size = qemu_get_be64(file); 4268 4269 /* The size of the bitmap should match with our ramblock */ 4270 if (size != local_size) { 4271 error_report("%s: ramblock '%s' bitmap size mismatch " 4272 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4273 block->idstr, size, local_size); 4274 ret = -EINVAL; 4275 goto out; 4276 } 4277 4278 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4279 end_mark = qemu_get_be64(file); 4280 4281 ret = qemu_file_get_error(file); 4282 if (ret || size != local_size) { 4283 error_report("%s: read bitmap failed for ramblock '%s': %d" 4284 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4285 __func__, block->idstr, ret, local_size, size); 4286 ret = -EIO; 4287 goto out; 4288 } 4289 4290 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4291 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4292 __func__, block->idstr, end_mark); 4293 ret = -EINVAL; 4294 goto out; 4295 } 4296 4297 /* 4298 * Endianness conversion. We are during postcopy (though paused). 4299 * The dirty bitmap won't change. We can directly modify it. 4300 */ 4301 bitmap_from_le(block->bmap, le_bitmap, nbits); 4302 4303 /* 4304 * What we received is "received bitmap". Revert it as the initial 4305 * dirty bitmap for this ramblock. 4306 */ 4307 bitmap_complement(block->bmap, block->bmap, nbits); 4308 4309 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4310 ramblock_dirty_bitmap_clear_discarded_pages(block); 4311 4312 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4313 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4314 4315 /* 4316 * We succeeded to sync bitmap for current ramblock. If this is 4317 * the last one to sync, we need to notify the main send thread. 4318 */ 4319 ram_dirty_bitmap_reload_notify(s); 4320 4321 ret = 0; 4322 out: 4323 g_free(le_bitmap); 4324 return ret; 4325 } 4326 4327 static int ram_resume_prepare(MigrationState *s, void *opaque) 4328 { 4329 RAMState *rs = *(RAMState **)opaque; 4330 int ret; 4331 4332 ret = ram_dirty_bitmap_sync_all(s, rs); 4333 if (ret) { 4334 return ret; 4335 } 4336 4337 ram_state_resume_prepare(rs, s->to_dst_file); 4338 4339 return 0; 4340 } 4341 4342 static SaveVMHandlers savevm_ram_handlers = { 4343 .save_setup = ram_save_setup, 4344 .save_live_iterate = ram_save_iterate, 4345 .save_live_complete_postcopy = ram_save_complete, 4346 .save_live_complete_precopy = ram_save_complete, 4347 .has_postcopy = ram_has_postcopy, 4348 .save_live_pending = ram_save_pending, 4349 .load_state = ram_load, 4350 .save_cleanup = ram_save_cleanup, 4351 .load_setup = ram_load_setup, 4352 .load_cleanup = ram_load_cleanup, 4353 .resume_prepare = ram_resume_prepare, 4354 }; 4355 4356 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4357 size_t old_size, size_t new_size) 4358 { 4359 PostcopyState ps = postcopy_state_get(); 4360 ram_addr_t offset; 4361 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4362 Error *err = NULL; 4363 4364 if (ramblock_is_ignored(rb)) { 4365 return; 4366 } 4367 4368 if (!migration_is_idle()) { 4369 /* 4370 * Precopy code on the source cannot deal with the size of RAM blocks 4371 * changing at random points in time - especially after sending the 4372 * RAM block sizes in the migration stream, they must no longer change. 4373 * Abort and indicate a proper reason. 
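 *
 * (One way to see why cancelling is the only safe option: the
 * destination has already resized its blocks to the lengths announced
 * in the RAM_SAVE_FLAG_MEM_SIZE record -- see the qemu_ram_resize()
 * call in ram_load_precopy() above -- so a later source-side resize
 * would leave the two sides disagreeing about used_length.)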
4374 */ 4375 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4376 migration_cancel(err); 4377 error_free(err); 4378 } 4379 4380 switch (ps) { 4381 case POSTCOPY_INCOMING_ADVISE: 4382 /* 4383 * Update what ram_postcopy_incoming_init()->init_range() does at the 4384 * time postcopy was advised. Syncing RAM blocks with the source will 4385 * result in RAM resizes. 4386 */ 4387 if (old_size < new_size) { 4388 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4389 error_report("RAM block '%s' discard of resized RAM failed", 4390 rb->idstr); 4391 } 4392 } 4393 rb->postcopy_length = new_size; 4394 break; 4395 case POSTCOPY_INCOMING_NONE: 4396 case POSTCOPY_INCOMING_RUNNING: 4397 case POSTCOPY_INCOMING_END: 4398 /* 4399 * Once our guest is running, postcopy no longer cares about 4400 * resizes. When growing, the new memory was not available on the 4401 * source, so no handling is needed. 4402 */ 4403 break; 4404 default: 4405 error_report("RAM block '%s' resized during postcopy state: %d", 4406 rb->idstr, ps); 4407 exit(-1); 4408 } 4409 } 4410 4411 static RAMBlockNotifier ram_mig_ram_notifier = { 4412 .ram_block_resized = ram_mig_ram_block_resized, 4413 }; 4414 4415 void ram_mig_init(void) 4416 { 4417 qemu_mutex_init(&XBZRLE.lock); 4418 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4419 ram_block_notifier_add(&ram_mig_ram_notifier); 4420 } 4421
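
/*
 * Overview (informal): ram_mig_init() is expected to be called once at
 * startup.  It registers "ram" as a live savevm device with section
 * version 4 -- the only version that ram_load() accepts -- and installs
 * ram_mig_ram_notifier, so that a RAM block resize either updates the
 * postcopy bookkeeping while in POSTCOPY_INCOMING_ADVISE or cancels an
 * in-flight precopy migration via migration_cancel().
 */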