/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock.
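     * The cache maps a guest page address to the copy of that page's
     * contents that was last stored, so a later pass can send only an
     * XBZRLE-encoded delta.  Rough sketch of the access pattern used
     * elsewhere in this file (illustrative, not a verbatim call site):
     *
     *   XBZRLE_cache_lock();
     *   cache_insert(XBZRLE.cache, addr, data, ram_counters.dirty_sync_count);
     *   ...
     *   XBZRLE_cache_unlock();
     *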
*/ 97 PageCache *cache; 98 QemuMutex lock; 99 /* it will store a page full of zeros */ 100 uint8_t *zero_target_page; 101 /* buffer used for XBZRLE decoding */ 102 uint8_t *decoded_buf; 103 } XBZRLE; 104 105 static void XBZRLE_cache_lock(void) 106 { 107 if (migrate_use_xbzrle()) { 108 qemu_mutex_lock(&XBZRLE.lock); 109 } 110 } 111 112 static void XBZRLE_cache_unlock(void) 113 { 114 if (migrate_use_xbzrle()) { 115 qemu_mutex_unlock(&XBZRLE.lock); 116 } 117 } 118 119 /** 120 * xbzrle_cache_resize: resize the xbzrle cache 121 * 122 * This function is called from migrate_params_apply in main 123 * thread, possibly while a migration is in progress. A running 124 * migration may be using the cache and might finish during this call, 125 * hence changes to the cache are protected by XBZRLE.lock(). 126 * 127 * Returns 0 for success or -1 for error 128 * 129 * @new_size: new cache size 130 * @errp: set *errp if the check failed, with reason 131 */ 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 133 { 134 PageCache *new_cache; 135 int64_t ret = 0; 136 137 /* Check for truncation */ 138 if (new_size != (size_t)new_size) { 139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 140 "exceeding address space"); 141 return -1; 142 } 143 144 if (new_size == migrate_xbzrle_cache_size()) { 145 /* nothing to do */ 146 return 0; 147 } 148 149 XBZRLE_cache_lock(); 150 151 if (XBZRLE.cache != NULL) { 152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 153 if (!new_cache) { 154 ret = -1; 155 goto out; 156 } 157 158 cache_fini(XBZRLE.cache); 159 XBZRLE.cache = new_cache; 160 } 161 out: 162 XBZRLE_cache_unlock(); 163 return ret; 164 } 165 166 bool ramblock_is_ignored(RAMBlock *block) 167 { 168 return !qemu_ram_is_migratable(block) || 169 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 170 } 171 172 #undef RAMBLOCK_FOREACH 173 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 175 { 176 RAMBlock *block; 177 int ret = 0; 178 179 RCU_READ_LOCK_GUARD(); 180 181 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 182 ret = func(block, opaque); 183 if (ret) { 184 break; 185 } 186 } 187 return ret; 188 } 189 190 static void ramblock_recv_map_init(void) 191 { 192 RAMBlock *rb; 193 194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 195 assert(!rb->receivedmap); 196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 197 } 198 } 199 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 201 { 202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 203 rb->receivedmap); 204 } 205 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 207 { 208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 209 } 210 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 212 { 213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 214 } 215 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 217 size_t nr) 218 { 219 bitmap_set_atomic(rb->receivedmap, 220 ramblock_recv_bitmap_offset(host_addr, rb), 221 nr); 222 } 223 224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 225 226 /* 227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 228 * 229 * Returns >0 if success with sent bytes, or <0 if error. 
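 *
 * Wire layout produced below (sketch):
 *
 *   be64:         bitmap size in bytes, rounded up to a multiple of 8
 *   <size> bytes: the receivedmap converted to little endian
 *   be64:         RAMBLOCK_RECV_BITMAP_ENDING marker (0x0123456789abcdef)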
230 */ 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 232 const char *block_name) 233 { 234 RAMBlock *block = qemu_ram_block_by_name(block_name); 235 unsigned long *le_bitmap, nbits; 236 uint64_t size; 237 238 if (!block) { 239 error_report("%s: invalid block name: %s", __func__, block_name); 240 return -1; 241 } 242 243 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 244 245 /* 246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 247 * machines we may need 4 more bytes for padding (see below 248 * comment). So extend it a bit before hand. 249 */ 250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 251 252 /* 253 * Always use little endian when sending the bitmap. This is 254 * required that when source and destination VMs are not using the 255 * same endianness. (Note: big endian won't work.) 256 */ 257 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 258 259 /* Size of the bitmap, in bytes */ 260 size = DIV_ROUND_UP(nbits, 8); 261 262 /* 263 * size is always aligned to 8 bytes for 64bit machines, but it 264 * may not be true for 32bit machines. We need this padding to 265 * make sure the migration can survive even between 32bit and 266 * 64bit machines. 267 */ 268 size = ROUND_UP(size, 8); 269 270 qemu_put_be64(file, size); 271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 272 /* 273 * Mark as an end, in case the middle part is screwed up due to 274 * some "mysterious" reason. 275 */ 276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 277 qemu_fflush(file); 278 279 g_free(le_bitmap); 280 281 if (qemu_file_get_error(file)) { 282 return qemu_file_get_error(file); 283 } 284 285 return size + sizeof(size); 286 } 287 288 /* 289 * An outstanding page request, on the source, having been received 290 * and queued 291 */ 292 struct RAMSrcPageRequest { 293 RAMBlock *rb; 294 hwaddr offset; 295 hwaddr len; 296 297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 298 }; 299 300 /* State of RAM for migration */ 301 struct RAMState { 302 /* QEMUFile used for this migration */ 303 QEMUFile *f; 304 /* UFFD file descriptor, used in 'write-tracking' migration */ 305 int uffdio_fd; 306 /* Last block that we have visited searching for dirty pages */ 307 RAMBlock *last_seen_block; 308 /* Last block from where we have sent data */ 309 RAMBlock *last_sent_block; 310 /* Last dirty target page we have sent */ 311 ram_addr_t last_page; 312 /* last ram version we have seen */ 313 uint32_t last_version; 314 /* How many times we have dirty too many pages */ 315 int dirty_rate_high_cnt; 316 /* these variables are used for bitmap sync */ 317 /* last time we did a full bitmap_sync */ 318 int64_t time_last_bitmap_sync; 319 /* bytes transferred at start_time */ 320 uint64_t bytes_xfer_prev; 321 /* number of dirty pages since start_time */ 322 uint64_t num_dirty_pages_period; 323 /* xbzrle misses since the beginning of the period */ 324 uint64_t xbzrle_cache_miss_prev; 325 /* Amount of xbzrle pages since the beginning of the period */ 326 uint64_t xbzrle_pages_prev; 327 /* Amount of xbzrle encoded bytes since the beginning of the period */ 328 uint64_t xbzrle_bytes_prev; 329 /* Start using XBZRLE (e.g., after the first round). 
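     * Set in find_dirty_block() once the first complete pass over RAM has
     * finished; save_page_use_compression() checks it too, so xbzrle and
     * compression are not used at the same time.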
*/ 330 bool xbzrle_enabled; 331 332 /* compression statistics since the beginning of the period */ 333 /* amount of count that no free thread to compress data */ 334 uint64_t compress_thread_busy_prev; 335 /* amount bytes after compression */ 336 uint64_t compressed_size_prev; 337 /* amount of compressed pages */ 338 uint64_t compress_pages_prev; 339 340 /* total handled target pages at the beginning of period */ 341 uint64_t target_page_count_prev; 342 /* total handled target pages since start */ 343 uint64_t target_page_count; 344 /* number of dirty bits in the bitmap */ 345 uint64_t migration_dirty_pages; 346 /* Protects modification of the bitmap and migration dirty pages */ 347 QemuMutex bitmap_mutex; 348 /* The RAMBlock used in the last src_page_requests */ 349 RAMBlock *last_req_rb; 350 /* Queue of outstanding page requests from the destination */ 351 QemuMutex src_page_req_mutex; 352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 353 }; 354 typedef struct RAMState RAMState; 355 356 static RAMState *ram_state; 357 358 static NotifierWithReturnList precopy_notifier_list; 359 360 void precopy_infrastructure_init(void) 361 { 362 notifier_with_return_list_init(&precopy_notifier_list); 363 } 364 365 void precopy_add_notifier(NotifierWithReturn *n) 366 { 367 notifier_with_return_list_add(&precopy_notifier_list, n); 368 } 369 370 void precopy_remove_notifier(NotifierWithReturn *n) 371 { 372 notifier_with_return_remove(n); 373 } 374 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 376 { 377 PrecopyNotifyData pnd; 378 pnd.reason = reason; 379 pnd.errp = errp; 380 381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 382 } 383 384 uint64_t ram_bytes_remaining(void) 385 { 386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 387 0; 388 } 389 390 MigrationStats ram_counters; 391 392 /* used by the search for pages to send */ 393 struct PageSearchStatus { 394 /* Current block being searched */ 395 RAMBlock *block; 396 /* Current page to search from */ 397 unsigned long page; 398 /* Set once we wrap around */ 399 bool complete_round; 400 }; 401 typedef struct PageSearchStatus PageSearchStatus; 402 403 CompressionStats compression_counters; 404 405 struct CompressParam { 406 bool done; 407 bool quit; 408 bool zero_page; 409 QEMUFile *file; 410 QemuMutex mutex; 411 QemuCond cond; 412 RAMBlock *block; 413 ram_addr_t offset; 414 415 /* internally used fields */ 416 z_stream stream; 417 uint8_t *originbuf; 418 }; 419 typedef struct CompressParam CompressParam; 420 421 struct DecompressParam { 422 bool done; 423 bool quit; 424 QemuMutex mutex; 425 QemuCond cond; 426 void *des; 427 uint8_t *compbuf; 428 int len; 429 z_stream stream; 430 }; 431 typedef struct DecompressParam DecompressParam; 432 433 static CompressParam *comp_param; 434 static QemuThread *compress_threads; 435 /* comp_done_cond is used to wake up the migration thread when 436 * one of the compression threads has finished the compression. 437 * comp_done_lock is used to co-work with comp_done_cond. 
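 *
 * Rough shape of the handshake (see do_data_compress() and
 * flush_compressed_data() below):
 *
 *   compression thread                migration thread
 *   ------------------                ----------------
 *   lock(comp_done_lock)              lock(comp_done_lock)
 *   param->done = true                while (!param->done)
 *   signal(comp_done_cond)                wait(comp_done_cond)
 *   unlock(comp_done_lock)            unlock(comp_done_lock)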
438 */ 439 static QemuMutex comp_done_lock; 440 static QemuCond comp_done_cond; 441 /* The empty QEMUFileOps will be used by file in CompressParam */ 442 static const QEMUFileOps empty_ops = { }; 443 444 static QEMUFile *decomp_file; 445 static DecompressParam *decomp_param; 446 static QemuThread *decompress_threads; 447 static QemuMutex decomp_done_lock; 448 static QemuCond decomp_done_cond; 449 450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 451 ram_addr_t offset, uint8_t *source_buf); 452 453 static void *do_data_compress(void *opaque) 454 { 455 CompressParam *param = opaque; 456 RAMBlock *block; 457 ram_addr_t offset; 458 bool zero_page; 459 460 qemu_mutex_lock(¶m->mutex); 461 while (!param->quit) { 462 if (param->block) { 463 block = param->block; 464 offset = param->offset; 465 param->block = NULL; 466 qemu_mutex_unlock(¶m->mutex); 467 468 zero_page = do_compress_ram_page(param->file, ¶m->stream, 469 block, offset, param->originbuf); 470 471 qemu_mutex_lock(&comp_done_lock); 472 param->done = true; 473 param->zero_page = zero_page; 474 qemu_cond_signal(&comp_done_cond); 475 qemu_mutex_unlock(&comp_done_lock); 476 477 qemu_mutex_lock(¶m->mutex); 478 } else { 479 qemu_cond_wait(¶m->cond, ¶m->mutex); 480 } 481 } 482 qemu_mutex_unlock(¶m->mutex); 483 484 return NULL; 485 } 486 487 static void compress_threads_save_cleanup(void) 488 { 489 int i, thread_count; 490 491 if (!migrate_use_compression() || !comp_param) { 492 return; 493 } 494 495 thread_count = migrate_compress_threads(); 496 for (i = 0; i < thread_count; i++) { 497 /* 498 * we use it as a indicator which shows if the thread is 499 * properly init'd or not 500 */ 501 if (!comp_param[i].file) { 502 break; 503 } 504 505 qemu_mutex_lock(&comp_param[i].mutex); 506 comp_param[i].quit = true; 507 qemu_cond_signal(&comp_param[i].cond); 508 qemu_mutex_unlock(&comp_param[i].mutex); 509 510 qemu_thread_join(compress_threads + i); 511 qemu_mutex_destroy(&comp_param[i].mutex); 512 qemu_cond_destroy(&comp_param[i].cond); 513 deflateEnd(&comp_param[i].stream); 514 g_free(comp_param[i].originbuf); 515 qemu_fclose(comp_param[i].file); 516 comp_param[i].file = NULL; 517 } 518 qemu_mutex_destroy(&comp_done_lock); 519 qemu_cond_destroy(&comp_done_cond); 520 g_free(compress_threads); 521 g_free(comp_param); 522 compress_threads = NULL; 523 comp_param = NULL; 524 } 525 526 static int compress_threads_save_setup(void) 527 { 528 int i, thread_count; 529 530 if (!migrate_use_compression()) { 531 return 0; 532 } 533 thread_count = migrate_compress_threads(); 534 compress_threads = g_new0(QemuThread, thread_count); 535 comp_param = g_new0(CompressParam, thread_count); 536 qemu_cond_init(&comp_done_cond); 537 qemu_mutex_init(&comp_done_lock); 538 for (i = 0; i < thread_count; i++) { 539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 540 if (!comp_param[i].originbuf) { 541 goto exit; 542 } 543 544 if (deflateInit(&comp_param[i].stream, 545 migrate_compress_level()) != Z_OK) { 546 g_free(comp_param[i].originbuf); 547 goto exit; 548 } 549 550 /* comp_param[i].file is just used as a dummy buffer to save data, 551 * set its ops to empty. 
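         * The compressed bytes accumulate in this buffer-backed QEMUFile
         * until the migration thread drains them into the real stream with
         * qemu_put_qemu_file(), see flush_compressed_data() and
         * compress_page_with_multi_thread().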
552 */ 553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false); 554 comp_param[i].done = true; 555 comp_param[i].quit = false; 556 qemu_mutex_init(&comp_param[i].mutex); 557 qemu_cond_init(&comp_param[i].cond); 558 qemu_thread_create(compress_threads + i, "compress", 559 do_data_compress, comp_param + i, 560 QEMU_THREAD_JOINABLE); 561 } 562 return 0; 563 564 exit: 565 compress_threads_save_cleanup(); 566 return -1; 567 } 568 569 /** 570 * save_page_header: write page header to wire 571 * 572 * If this is the 1st block, it also writes the block identification 573 * 574 * Returns the number of bytes written 575 * 576 * @f: QEMUFile where to send the data 577 * @block: block that contains the page we want to send 578 * @offset: offset inside the block for the page 579 * in the lower bits, it contains flags 580 */ 581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 582 ram_addr_t offset) 583 { 584 size_t size, len; 585 586 if (block == rs->last_sent_block) { 587 offset |= RAM_SAVE_FLAG_CONTINUE; 588 } 589 qemu_put_be64(f, offset); 590 size = 8; 591 592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 593 len = strlen(block->idstr); 594 qemu_put_byte(f, len); 595 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 596 size += 1 + len; 597 rs->last_sent_block = block; 598 } 599 return size; 600 } 601 602 /** 603 * mig_throttle_guest_down: throttle down the guest 604 * 605 * Reduce amount of guest cpu execution to hopefully slow down memory 606 * writes. If guest dirty memory rate is reduced below the rate at 607 * which we can transfer pages to the destination then we should be 608 * able to complete migration. Some workloads dirty memory way too 609 * fast and will not effectively converge, even with auto-converge. 610 */ 611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 612 uint64_t bytes_dirty_threshold) 613 { 614 MigrationState *s = migrate_get_current(); 615 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 616 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 618 int pct_max = s->parameters.max_cpu_throttle; 619 620 uint64_t throttle_now = cpu_throttle_get_percentage(); 621 uint64_t cpu_now, cpu_ideal, throttle_inc; 622 623 /* We have not started throttling yet. Let's start it. */ 624 if (!cpu_throttle_active()) { 625 cpu_throttle_set(pct_initial); 626 } else { 627 /* Throttling already on, just increase the rate */ 628 if (!pct_tailslow) { 629 throttle_inc = pct_increment; 630 } else { 631 /* Compute the ideal CPU percentage used by Guest, which may 632 * make the dirty rate match the dirty rate threshold. */ 633 cpu_now = 100 - throttle_now; 634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 635 bytes_dirty_period); 636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 637 } 638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 639 } 640 } 641 642 /** 643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 644 * 645 * @rs: current RAM state 646 * @current_addr: address for the zero page 647 * 648 * Update the xbzrle cache to reflect a page that's been sent as all 0. 649 * The important thing is that a stale (not-yet-0'd) page be replaced 650 * by the new data. 651 * As a bonus, if the page wasn't in the cache it gets added so that 652 * when a small write is made into the 0'd page it gets XBZRLE sent. 
653 */ 654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 655 { 656 if (!rs->xbzrle_enabled) { 657 return; 658 } 659 660 /* We don't care if this fails to allocate a new cache page 661 * as long as it updated an old one */ 662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 663 ram_counters.dirty_sync_count); 664 } 665 666 #define ENCODING_FLAG_XBZRLE 0x1 667 668 /** 669 * save_xbzrle_page: compress and send current page 670 * 671 * Returns: 1 means that we wrote the page 672 * 0 means that page is identical to the one already sent 673 * -1 means that xbzrle would be longer than normal 674 * 675 * @rs: current RAM state 676 * @current_data: pointer to the address of the page contents 677 * @current_addr: addr of the page 678 * @block: block that contains the page we want to send 679 * @offset: offset inside the block for the page 680 * @last_stage: if we are at the completion stage 681 */ 682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 683 ram_addr_t current_addr, RAMBlock *block, 684 ram_addr_t offset, bool last_stage) 685 { 686 int encoded_len = 0, bytes_xbzrle; 687 uint8_t *prev_cached_page; 688 689 if (!cache_is_cached(XBZRLE.cache, current_addr, 690 ram_counters.dirty_sync_count)) { 691 xbzrle_counters.cache_miss++; 692 if (!last_stage) { 693 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 694 ram_counters.dirty_sync_count) == -1) { 695 return -1; 696 } else { 697 /* update *current_data when the page has been 698 inserted into cache */ 699 *current_data = get_cached_data(XBZRLE.cache, current_addr); 700 } 701 } 702 return -1; 703 } 704 705 /* 706 * Reaching here means the page has hit the xbzrle cache, no matter what 707 * encoding result it is (normal encoding, overflow or skipping the page), 708 * count the page as encoded. This is used to calculate the encoding rate. 709 * 710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 711 * 2nd page turns out to be skipped (i.e. no new bytes written to the 712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 713 * skipped page included. In this way, the encoding rate can tell if the 714 * guest page is good for xbzrle encoding. 715 */ 716 xbzrle_counters.pages++; 717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 718 719 /* save current buffer into memory */ 720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 721 722 /* XBZRLE encoding (if there is no overflow) */ 723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 725 TARGET_PAGE_SIZE); 726 727 /* 728 * Update the cache contents, so that it corresponds to the data 729 * sent, in all cases except where we skip the page. 730 */ 731 if (!last_stage && encoded_len != 0) { 732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 733 /* 734 * In the case where we couldn't compress, ensure that the caller 735 * sends the data from the cache, since the guest might have 736 * changed the RAM since we copied it. 
737 */ 738 *current_data = prev_cached_page; 739 } 740 741 if (encoded_len == 0) { 742 trace_save_xbzrle_page_skipping(); 743 return 0; 744 } else if (encoded_len == -1) { 745 trace_save_xbzrle_page_overflow(); 746 xbzrle_counters.overflow++; 747 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 748 return -1; 749 } 750 751 /* Send XBZRLE based compressed page */ 752 bytes_xbzrle = save_page_header(rs, rs->f, block, 753 offset | RAM_SAVE_FLAG_XBZRLE); 754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 755 qemu_put_be16(rs->f, encoded_len); 756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 757 bytes_xbzrle += encoded_len + 1 + 2; 758 /* 759 * Like compressed_size (please see update_compress_thread_counts), 760 * the xbzrle encoded bytes don't count the 8 byte header with 761 * RAM_SAVE_FLAG_CONTINUE. 762 */ 763 xbzrle_counters.bytes += bytes_xbzrle - 8; 764 ram_counters.transferred += bytes_xbzrle; 765 766 return 1; 767 } 768 769 /** 770 * migration_bitmap_find_dirty: find the next dirty page from start 771 * 772 * Returns the page offset within memory region of the start of a dirty page 773 * 774 * @rs: current RAM state 775 * @rb: RAMBlock where to search for dirty pages 776 * @start: page where we start the search 777 */ 778 static inline 779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 780 unsigned long start) 781 { 782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 783 unsigned long *bitmap = rb->bmap; 784 785 if (ramblock_is_ignored(rb)) { 786 return size; 787 } 788 789 return find_next_bit(bitmap, size, start); 790 } 791 792 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 793 unsigned long page) 794 { 795 uint8_t shift; 796 hwaddr size, start; 797 798 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 799 return; 800 } 801 802 shift = rb->clear_bmap_shift; 803 /* 804 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 805 * can make things easier sometimes since then start address 806 * of the small chunk will always be 64 pages aligned so the 807 * bitmap will always be aligned to unsigned long. We should 808 * even be able to remove this restriction but I'm simply 809 * keeping it. 810 */ 811 assert(shift >= 6); 812 813 size = 1ULL << (TARGET_PAGE_BITS + shift); 814 start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); 815 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 816 memory_region_clear_dirty_bitmap(rb->mr, start, size); 817 } 818 819 static void 820 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 821 unsigned long start, 822 unsigned long npages) 823 { 824 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 825 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 826 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 827 828 /* 829 * Clear pages from start to start + npages - 1, so the end boundary is 830 * exclusive. 831 */ 832 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 833 migration_clear_memory_region_dirty_bitmap(rb, i); 834 } 835 } 836 837 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 838 RAMBlock *rb, 839 unsigned long page) 840 { 841 bool ret; 842 843 /* 844 * Clear dirty bitmap if needed. This _must_ be called before we 845 * send any of the page in the chunk because we need to make sure 846 * we can capture further page content changes when we sync dirty 847 * log the next time. 
So as long as we are going to send any of 848 * the page in the chunk we clear the remote dirty bitmap for all. 849 * Clearing it earlier won't be a problem, but too late will. 850 */ 851 migration_clear_memory_region_dirty_bitmap(rb, page); 852 853 ret = test_and_clear_bit(page, rb->bmap); 854 if (ret) { 855 rs->migration_dirty_pages--; 856 } 857 858 return ret; 859 } 860 861 /* Called with RCU critical section */ 862 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 863 { 864 uint64_t new_dirty_pages = 865 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 866 867 rs->migration_dirty_pages += new_dirty_pages; 868 rs->num_dirty_pages_period += new_dirty_pages; 869 } 870 871 /** 872 * ram_pagesize_summary: calculate all the pagesizes of a VM 873 * 874 * Returns a summary bitmap of the page sizes of all RAMBlocks 875 * 876 * For VMs with just normal pages this is equivalent to the host page 877 * size. If it's got some huge pages then it's the OR of all the 878 * different page sizes. 879 */ 880 uint64_t ram_pagesize_summary(void) 881 { 882 RAMBlock *block; 883 uint64_t summary = 0; 884 885 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 886 summary |= block->page_size; 887 } 888 889 return summary; 890 } 891 892 uint64_t ram_get_total_transferred_pages(void) 893 { 894 return ram_counters.normal + ram_counters.duplicate + 895 compression_counters.pages + xbzrle_counters.pages; 896 } 897 898 static void migration_update_rates(RAMState *rs, int64_t end_time) 899 { 900 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 901 double compressed_size; 902 903 /* calculate period counters */ 904 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 905 / (end_time - rs->time_last_bitmap_sync); 906 907 if (!page_count) { 908 return; 909 } 910 911 if (migrate_use_xbzrle()) { 912 double encoded_size, unencoded_size; 913 914 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 915 rs->xbzrle_cache_miss_prev) / page_count; 916 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 917 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 918 TARGET_PAGE_SIZE; 919 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 920 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 921 xbzrle_counters.encoding_rate = 0; 922 } else { 923 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 924 } 925 rs->xbzrle_pages_prev = xbzrle_counters.pages; 926 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 927 } 928 929 if (migrate_use_compression()) { 930 compression_counters.busy_rate = (double)(compression_counters.busy - 931 rs->compress_thread_busy_prev) / page_count; 932 rs->compress_thread_busy_prev = compression_counters.busy; 933 934 compressed_size = compression_counters.compressed_size - 935 rs->compressed_size_prev; 936 if (compressed_size) { 937 double uncompressed_size = (compression_counters.pages - 938 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 939 940 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 941 compression_counters.compression_rate = 942 uncompressed_size / compressed_size; 943 944 rs->compress_pages_prev = compression_counters.pages; 945 rs->compressed_size_prev = compression_counters.compressed_size; 946 } 947 } 948 } 949 950 static void migration_trigger_throttle(RAMState *rs) 951 { 952 MigrationState *s = migrate_get_current(); 953 uint64_t threshold = s->parameters.throttle_trigger_threshold; 954 955 uint64_t bytes_xfer_period = ram_counters.transferred - 
                                                    rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
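     * (Both notify calls below follow that rule: a failing notifier is
     * reported via error_report_err() and then ignored.)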
1032 */ 1033 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1034 error_report_err(local_err); 1035 local_err = NULL; 1036 } 1037 1038 migration_bitmap_sync(rs); 1039 1040 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1041 error_report_err(local_err); 1042 } 1043 } 1044 1045 /** 1046 * save_zero_page_to_file: send the zero page to the file 1047 * 1048 * Returns the size of data written to the file, 0 means the page is not 1049 * a zero page 1050 * 1051 * @rs: current RAM state 1052 * @file: the file where the data is saved 1053 * @block: block that contains the page we want to send 1054 * @offset: offset inside the block for the page 1055 */ 1056 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1057 RAMBlock *block, ram_addr_t offset) 1058 { 1059 uint8_t *p = block->host + offset; 1060 int len = 0; 1061 1062 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1063 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1064 qemu_put_byte(file, 0); 1065 len += 1; 1066 } 1067 return len; 1068 } 1069 1070 /** 1071 * save_zero_page: send the zero page to the stream 1072 * 1073 * Returns the number of pages written. 1074 * 1075 * @rs: current RAM state 1076 * @block: block that contains the page we want to send 1077 * @offset: offset inside the block for the page 1078 */ 1079 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1080 { 1081 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1082 1083 if (len) { 1084 ram_counters.duplicate++; 1085 ram_counters.transferred += len; 1086 return 1; 1087 } 1088 return -1; 1089 } 1090 1091 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1092 { 1093 if (!migrate_release_ram() || !migration_in_postcopy()) { 1094 return; 1095 } 1096 1097 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1098 } 1099 1100 /* 1101 * @pages: the number of pages written by the control path, 1102 * < 0 - error 1103 * > 0 - number of pages written 1104 * 1105 * Return true if the pages has been saved, otherwise false is returned. 1106 */ 1107 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1108 int *pages) 1109 { 1110 uint64_t bytes_xmit = 0; 1111 int ret; 1112 1113 *pages = -1; 1114 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1115 &bytes_xmit); 1116 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1117 return false; 1118 } 1119 1120 if (bytes_xmit) { 1121 ram_counters.transferred += bytes_xmit; 1122 *pages = 1; 1123 } 1124 1125 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1126 return true; 1127 } 1128 1129 if (bytes_xmit > 0) { 1130 ram_counters.normal++; 1131 } else if (bytes_xmit == 0) { 1132 ram_counters.duplicate++; 1133 } 1134 1135 return true; 1136 } 1137 1138 /* 1139 * directly send the page to the stream 1140 * 1141 * Returns the number of pages written. 
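 *
 * On the wire this costs the 8-byte header written by save_page_header()
 * (plus the block idstr the first time a block is seen) followed by
 * TARGET_PAGE_SIZE bytes of page data; the two ram_counters.transferred
 * updates below account for exactly that.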
1142 * 1143 * @rs: current RAM state 1144 * @block: block that contains the page we want to send 1145 * @offset: offset inside the block for the page 1146 * @buf: the page to be sent 1147 * @async: send to page asyncly 1148 */ 1149 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1150 uint8_t *buf, bool async) 1151 { 1152 ram_counters.transferred += save_page_header(rs, rs->f, block, 1153 offset | RAM_SAVE_FLAG_PAGE); 1154 if (async) { 1155 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1156 migrate_release_ram() & 1157 migration_in_postcopy()); 1158 } else { 1159 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1160 } 1161 ram_counters.transferred += TARGET_PAGE_SIZE; 1162 ram_counters.normal++; 1163 return 1; 1164 } 1165 1166 /** 1167 * ram_save_page: send the given page to the stream 1168 * 1169 * Returns the number of pages written. 1170 * < 0 - error 1171 * >=0 - Number of pages written - this might legally be 0 1172 * if xbzrle noticed the page was the same. 1173 * 1174 * @rs: current RAM state 1175 * @block: block that contains the page we want to send 1176 * @offset: offset inside the block for the page 1177 * @last_stage: if we are at the completion stage 1178 */ 1179 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1180 { 1181 int pages = -1; 1182 uint8_t *p; 1183 bool send_async = true; 1184 RAMBlock *block = pss->block; 1185 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1186 ram_addr_t current_addr = block->offset + offset; 1187 1188 p = block->host + offset; 1189 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1190 1191 XBZRLE_cache_lock(); 1192 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1193 pages = save_xbzrle_page(rs, &p, current_addr, block, 1194 offset, last_stage); 1195 if (!last_stage) { 1196 /* Can't send this cached data async, since the cache page 1197 * might get updated before it gets to the wire 1198 */ 1199 send_async = false; 1200 } 1201 } 1202 1203 /* XBZRLE overflow or normal page */ 1204 if (pages == -1) { 1205 pages = save_normal_page(rs, block, offset, p, send_async); 1206 } 1207 1208 XBZRLE_cache_unlock(); 1209 1210 return pages; 1211 } 1212 1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1214 ram_addr_t offset) 1215 { 1216 if (multifd_queue_page(rs->f, block, offset) < 0) { 1217 return -1; 1218 } 1219 ram_counters.normal++; 1220 1221 return 1; 1222 } 1223 1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1225 ram_addr_t offset, uint8_t *source_buf) 1226 { 1227 RAMState *rs = ram_state; 1228 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1229 bool zero_page = false; 1230 int ret; 1231 1232 if (save_zero_page_to_file(rs, f, block, offset)) { 1233 zero_page = true; 1234 goto exit; 1235 } 1236 1237 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1238 1239 /* 1240 * copy it to a internal buffer to avoid it being modified by VM 1241 * so that we can catch up the error during compression and 1242 * decompression 1243 */ 1244 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1245 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1246 if (ret < 0) { 1247 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1248 error_report("compressed data failed!"); 1249 return false; 1250 } 1251 1252 exit: 1253 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1254 return zero_page; 1255 } 1256 1257 static void 1258 update_compress_thread_counts(const 
CompressParam *param, int bytes_xmit) 1259 { 1260 ram_counters.transferred += bytes_xmit; 1261 1262 if (param->zero_page) { 1263 ram_counters.duplicate++; 1264 return; 1265 } 1266 1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1268 compression_counters.compressed_size += bytes_xmit - 8; 1269 compression_counters.pages++; 1270 } 1271 1272 static bool save_page_use_compression(RAMState *rs); 1273 1274 static void flush_compressed_data(RAMState *rs) 1275 { 1276 int idx, len, thread_count; 1277 1278 if (!save_page_use_compression(rs)) { 1279 return; 1280 } 1281 thread_count = migrate_compress_threads(); 1282 1283 qemu_mutex_lock(&comp_done_lock); 1284 for (idx = 0; idx < thread_count; idx++) { 1285 while (!comp_param[idx].done) { 1286 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1287 } 1288 } 1289 qemu_mutex_unlock(&comp_done_lock); 1290 1291 for (idx = 0; idx < thread_count; idx++) { 1292 qemu_mutex_lock(&comp_param[idx].mutex); 1293 if (!comp_param[idx].quit) { 1294 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1295 /* 1296 * it's safe to fetch zero_page without holding comp_done_lock 1297 * as there is no further request submitted to the thread, 1298 * i.e, the thread should be waiting for a request at this point. 1299 */ 1300 update_compress_thread_counts(&comp_param[idx], len); 1301 } 1302 qemu_mutex_unlock(&comp_param[idx].mutex); 1303 } 1304 } 1305 1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1307 ram_addr_t offset) 1308 { 1309 param->block = block; 1310 param->offset = offset; 1311 } 1312 1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1314 ram_addr_t offset) 1315 { 1316 int idx, thread_count, bytes_xmit = -1, pages = -1; 1317 bool wait = migrate_compress_wait_thread(); 1318 1319 thread_count = migrate_compress_threads(); 1320 qemu_mutex_lock(&comp_done_lock); 1321 retry: 1322 for (idx = 0; idx < thread_count; idx++) { 1323 if (comp_param[idx].done) { 1324 comp_param[idx].done = false; 1325 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1326 qemu_mutex_lock(&comp_param[idx].mutex); 1327 set_compress_params(&comp_param[idx], block, offset); 1328 qemu_cond_signal(&comp_param[idx].cond); 1329 qemu_mutex_unlock(&comp_param[idx].mutex); 1330 pages = 1; 1331 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1332 break; 1333 } 1334 } 1335 1336 /* 1337 * wait for the free thread if the user specifies 'compress-wait-thread', 1338 * otherwise we will post the page out in the main thread as normal page. 1339 */ 1340 if (pages < 0 && wait) { 1341 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1342 goto retry; 1343 } 1344 qemu_mutex_unlock(&comp_done_lock); 1345 1346 return pages; 1347 } 1348 1349 /** 1350 * find_dirty_block: find the next dirty page and update any state 1351 * associated with the search process. 1352 * 1353 * Returns true if a page is found 1354 * 1355 * @rs: current RAM state 1356 * @pss: data about the state of the current dirty page scan 1357 * @again: set to false if the search has scanned the whole of RAM 1358 */ 1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1360 { 1361 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1362 if (pss->complete_round && pss->block == rs->last_seen_block && 1363 pss->page >= rs->last_page) { 1364 /* 1365 * We've been once around the RAM and haven't found anything. 1366 * Give up. 
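         * (complete_round is set further down once the scan wraps past the
         * last block; last_seen_block/last_page record where the previous
         * search left off.)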
1367 */ 1368 *again = false; 1369 return false; 1370 } 1371 if (!offset_in_ramblock(pss->block, 1372 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1373 /* Didn't find anything in this RAM Block */ 1374 pss->page = 0; 1375 pss->block = QLIST_NEXT_RCU(pss->block, next); 1376 if (!pss->block) { 1377 /* 1378 * If memory migration starts over, we will meet a dirtied page 1379 * which may still exists in compression threads's ring, so we 1380 * should flush the compressed data to make sure the new page 1381 * is not overwritten by the old one in the destination. 1382 * 1383 * Also If xbzrle is on, stop using the data compression at this 1384 * point. In theory, xbzrle can do better than compression. 1385 */ 1386 flush_compressed_data(rs); 1387 1388 /* Hit the end of the list */ 1389 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1390 /* Flag that we've looped */ 1391 pss->complete_round = true; 1392 /* After the first round, enable XBZRLE. */ 1393 if (migrate_use_xbzrle()) { 1394 rs->xbzrle_enabled = true; 1395 } 1396 } 1397 /* Didn't find anything this time, but try again on the new block */ 1398 *again = true; 1399 return false; 1400 } else { 1401 /* Can go around again, but... */ 1402 *again = true; 1403 /* We've found something so probably don't need to */ 1404 return true; 1405 } 1406 } 1407 1408 /** 1409 * unqueue_page: gets a page of the queue 1410 * 1411 * Helper for 'get_queued_page' - gets a page off the queue 1412 * 1413 * Returns the block of the page (or NULL if none available) 1414 * 1415 * @rs: current RAM state 1416 * @offset: used to return the offset within the RAMBlock 1417 */ 1418 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1419 { 1420 RAMBlock *block = NULL; 1421 1422 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1423 return NULL; 1424 } 1425 1426 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1427 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1428 struct RAMSrcPageRequest *entry = 1429 QSIMPLEQ_FIRST(&rs->src_page_requests); 1430 block = entry->rb; 1431 *offset = entry->offset; 1432 1433 if (entry->len > TARGET_PAGE_SIZE) { 1434 entry->len -= TARGET_PAGE_SIZE; 1435 entry->offset += TARGET_PAGE_SIZE; 1436 } else { 1437 memory_region_unref(block->mr); 1438 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1439 g_free(entry); 1440 migration_consume_urgent_request(); 1441 } 1442 } 1443 1444 return block; 1445 } 1446 1447 #if defined(__linux__) 1448 /** 1449 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1450 * is found, return RAM block pointer and page offset 1451 * 1452 * Returns pointer to the RAMBlock containing faulting page, 1453 * NULL if no write faults are pending 1454 * 1455 * @rs: current RAM state 1456 * @offset: page offset from the beginning of the block 1457 */ 1458 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1459 { 1460 struct uffd_msg uffd_msg; 1461 void *page_address; 1462 RAMBlock *block; 1463 int res; 1464 1465 if (!migrate_background_snapshot()) { 1466 return NULL; 1467 } 1468 1469 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1470 if (res <= 0) { 1471 return NULL; 1472 } 1473 1474 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1475 block = qemu_ram_block_from_host(page_address, false, offset); 1476 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1477 return block; 1478 } 1479 1480 /** 1481 * ram_save_release_protection: release UFFD write protection after 1482 * a range of pages has been saved 1483 * 1484 * @rs: current RAM state 
1485 * @pss: page-search-status structure 1486 * @start_page: index of the first page in the range relative to pss->block 1487 * 1488 * Returns 0 on success, negative value in case of an error 1489 */ 1490 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1491 unsigned long start_page) 1492 { 1493 int res = 0; 1494 1495 /* Check if page is from UFFD-managed region. */ 1496 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1497 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1498 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; 1499 1500 /* Flush async buffers before un-protect. */ 1501 qemu_fflush(rs->f); 1502 /* Un-protect memory range. */ 1503 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1504 false, false); 1505 } 1506 1507 return res; 1508 } 1509 1510 /* ram_write_tracking_available: check if kernel supports required UFFD features 1511 * 1512 * Returns true if supports, false otherwise 1513 */ 1514 bool ram_write_tracking_available(void) 1515 { 1516 uint64_t uffd_features; 1517 int res; 1518 1519 res = uffd_query_features(&uffd_features); 1520 return (res == 0 && 1521 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1522 } 1523 1524 /* ram_write_tracking_compatible: check if guest configuration is 1525 * compatible with 'write-tracking' 1526 * 1527 * Returns true if compatible, false otherwise 1528 */ 1529 bool ram_write_tracking_compatible(void) 1530 { 1531 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1532 int uffd_fd; 1533 RAMBlock *block; 1534 bool ret = false; 1535 1536 /* Open UFFD file descriptor */ 1537 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1538 if (uffd_fd < 0) { 1539 return false; 1540 } 1541 1542 RCU_READ_LOCK_GUARD(); 1543 1544 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1545 uint64_t uffd_ioctls; 1546 1547 /* Nothing to do with read-only and MMIO-writable regions */ 1548 if (block->mr->readonly || block->mr->rom_device) { 1549 continue; 1550 } 1551 /* Try to register block memory via UFFD-IO to track writes */ 1552 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1553 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1554 goto out; 1555 } 1556 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1557 goto out; 1558 } 1559 } 1560 ret = true; 1561 1562 out: 1563 uffd_close_fd(uffd_fd); 1564 return ret; 1565 } 1566 1567 /* 1568 * ram_block_populate_pages: populate memory in the RAM block by reading 1569 * an integer from the beginning of each page. 1570 * 1571 * Since it's solely used for userfault_fd WP feature, here we just 1572 * hardcode page size to qemu_real_host_page_size. 
1573 * 1574 * @block: RAM block to populate 1575 */ 1576 static void ram_block_populate_pages(RAMBlock *block) 1577 { 1578 char *ptr = (char *) block->host; 1579 1580 for (ram_addr_t offset = 0; offset < block->used_length; 1581 offset += qemu_real_host_page_size) { 1582 char tmp = *(ptr + offset); 1583 1584 /* Don't optimize the read out */ 1585 asm volatile("" : "+r" (tmp)); 1586 } 1587 } 1588 1589 /* 1590 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1591 */ 1592 void ram_write_tracking_prepare(void) 1593 { 1594 RAMBlock *block; 1595 1596 RCU_READ_LOCK_GUARD(); 1597 1598 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1599 /* Nothing to do with read-only and MMIO-writable regions */ 1600 if (block->mr->readonly || block->mr->rom_device) { 1601 continue; 1602 } 1603 1604 /* 1605 * Populate pages of the RAM block before enabling userfault_fd 1606 * write protection. 1607 * 1608 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1609 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1610 * pages with pte_none() entries in page table. 1611 */ 1612 ram_block_populate_pages(block); 1613 } 1614 } 1615 1616 /* 1617 * ram_write_tracking_start: start UFFD-WP memory tracking 1618 * 1619 * Returns 0 for success or negative value in case of error 1620 */ 1621 int ram_write_tracking_start(void) 1622 { 1623 int uffd_fd; 1624 RAMState *rs = ram_state; 1625 RAMBlock *block; 1626 1627 /* Open UFFD file descriptor */ 1628 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1629 if (uffd_fd < 0) { 1630 return uffd_fd; 1631 } 1632 rs->uffdio_fd = uffd_fd; 1633 1634 RCU_READ_LOCK_GUARD(); 1635 1636 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1637 /* Nothing to do with read-only and MMIO-writable regions */ 1638 if (block->mr->readonly || block->mr->rom_device) { 1639 continue; 1640 } 1641 1642 /* Register block memory with UFFD to track writes */ 1643 if (uffd_register_memory(rs->uffdio_fd, block->host, 1644 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1645 goto fail; 1646 } 1647 /* Apply UFFD write protection to the block memory range */ 1648 if (uffd_change_protection(rs->uffdio_fd, block->host, 1649 block->max_length, true, false)) { 1650 goto fail; 1651 } 1652 block->flags |= RAM_UF_WRITEPROTECT; 1653 memory_region_ref(block->mr); 1654 1655 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1656 block->host, block->max_length); 1657 } 1658 1659 return 0; 1660 1661 fail: 1662 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1663 1664 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1665 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1666 continue; 1667 } 1668 /* 1669 * In case some memory block failed to be write-protected 1670 * remove protection and unregister all succeeded RAM blocks 1671 */ 1672 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1673 false, false); 1674 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1675 /* Cleanup flags and remove reference */ 1676 block->flags &= ~RAM_UF_WRITEPROTECT; 1677 memory_region_unref(block->mr); 1678 } 1679 1680 uffd_close_fd(uffd_fd); 1681 rs->uffdio_fd = -1; 1682 return -1; 1683 } 1684 1685 /** 1686 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1687 */ 1688 void ram_write_tracking_stop(void) 1689 { 1690 RAMState *rs = ram_state; 1691 RAMBlock *block; 1692 1693 RCU_READ_LOCK_GUARD(); 1694 1695 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1696 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1697 
continue; 1698 } 1699 /* Remove protection and unregister all affected RAM blocks */ 1700 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1701 false, false); 1702 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1703 1704 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1705 block->host, block->max_length); 1706 1707 /* Cleanup flags and remove reference */ 1708 block->flags &= ~RAM_UF_WRITEPROTECT; 1709 memory_region_unref(block->mr); 1710 } 1711 1712 /* Finally close UFFD file descriptor */ 1713 uffd_close_fd(rs->uffdio_fd); 1714 rs->uffdio_fd = -1; 1715 } 1716 1717 #else 1718 /* No target OS support, stubs just fail or ignore */ 1719 1720 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1721 { 1722 (void) rs; 1723 (void) offset; 1724 1725 return NULL; 1726 } 1727 1728 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1729 unsigned long start_page) 1730 { 1731 (void) rs; 1732 (void) pss; 1733 (void) start_page; 1734 1735 return 0; 1736 } 1737 1738 bool ram_write_tracking_available(void) 1739 { 1740 return false; 1741 } 1742 1743 bool ram_write_tracking_compatible(void) 1744 { 1745 assert(0); 1746 return false; 1747 } 1748 1749 int ram_write_tracking_start(void) 1750 { 1751 assert(0); 1752 return -1; 1753 } 1754 1755 void ram_write_tracking_stop(void) 1756 { 1757 assert(0); 1758 } 1759 #endif /* defined(__linux__) */ 1760 1761 /** 1762 * get_queued_page: unqueue a page from the postcopy requests 1763 * 1764 * Skips pages that are already sent (!dirty) 1765 * 1766 * Returns true if a queued page is found 1767 * 1768 * @rs: current RAM state 1769 * @pss: data about the state of the current dirty page scan 1770 */ 1771 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1772 { 1773 RAMBlock *block; 1774 ram_addr_t offset; 1775 bool dirty; 1776 1777 do { 1778 block = unqueue_page(rs, &offset); 1779 /* 1780 * We're sending this page, and since it's postcopy nothing else 1781 * will dirty it, and we must make sure it doesn't get sent again 1782 * even if this queue request was received after the background 1783 * search already sent it. 1784 */ 1785 if (block) { 1786 unsigned long page; 1787 1788 page = offset >> TARGET_PAGE_BITS; 1789 dirty = test_bit(page, block->bmap); 1790 if (!dirty) { 1791 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1792 page); 1793 } else { 1794 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1795 } 1796 } 1797 1798 } while (block && !dirty); 1799 1800 if (!block) { 1801 /* 1802 * Poll write faults too if background snapshot is enabled; that's 1803 * when we have vcpus got blocked by the write protected pages. 1804 */ 1805 block = poll_fault_page(rs, &offset); 1806 } 1807 1808 if (block) { 1809 /* 1810 * We want the background search to continue from the queued page 1811 * since the guest is likely to want other pages near to the page 1812 * it just requested. 1813 */ 1814 pss->block = block; 1815 pss->page = offset >> TARGET_PAGE_BITS; 1816 1817 /* 1818 * This unqueued page would break the "one round" check, even is 1819 * really rare. 1820 */ 1821 pss->complete_round = false; 1822 } 1823 1824 return !!block; 1825 } 1826 1827 /** 1828 * migration_page_queue_free: drop any remaining pages in the ram 1829 * request queue 1830 * 1831 * It should be empty at the end anyway, but in error cases there may 1832 * be some left. in case that there is any page left, we drop it. 
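 *
 * Each queued request holds a reference on its RAMBlock's MemoryRegion
 * (taken in ram_save_queue_pages()), so dropping a request must also drop
 * that reference.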
1833 * 1834 */ 1835 static void migration_page_queue_free(RAMState *rs) 1836 { 1837 struct RAMSrcPageRequest *mspr, *next_mspr; 1838 /* This queue generally should be empty - but in the case of a failed 1839 * migration might have some droppings in. 1840 */ 1841 RCU_READ_LOCK_GUARD(); 1842 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1843 memory_region_unref(mspr->rb->mr); 1844 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1845 g_free(mspr); 1846 } 1847 } 1848 1849 /** 1850 * ram_save_queue_pages: queue the page for transmission 1851 * 1852 * A request from postcopy destination for example. 1853 * 1854 * Returns zero on success or negative on error 1855 * 1856 * @rbname: Name of the RAMBLock of the request. NULL means the 1857 * same that last one. 1858 * @start: starting address from the start of the RAMBlock 1859 * @len: length (in bytes) to send 1860 */ 1861 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1862 { 1863 RAMBlock *ramblock; 1864 RAMState *rs = ram_state; 1865 1866 ram_counters.postcopy_requests++; 1867 RCU_READ_LOCK_GUARD(); 1868 1869 if (!rbname) { 1870 /* Reuse last RAMBlock */ 1871 ramblock = rs->last_req_rb; 1872 1873 if (!ramblock) { 1874 /* 1875 * Shouldn't happen, we can't reuse the last RAMBlock if 1876 * it's the 1st request. 1877 */ 1878 error_report("ram_save_queue_pages no previous block"); 1879 return -1; 1880 } 1881 } else { 1882 ramblock = qemu_ram_block_by_name(rbname); 1883 1884 if (!ramblock) { 1885 /* We shouldn't be asked for a non-existent RAMBlock */ 1886 error_report("ram_save_queue_pages no block '%s'", rbname); 1887 return -1; 1888 } 1889 rs->last_req_rb = ramblock; 1890 } 1891 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1892 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1893 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1894 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1895 __func__, start, len, ramblock->used_length); 1896 return -1; 1897 } 1898 1899 struct RAMSrcPageRequest *new_entry = 1900 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1901 new_entry->rb = ramblock; 1902 new_entry->offset = start; 1903 new_entry->len = len; 1904 1905 memory_region_ref(ramblock->mr); 1906 qemu_mutex_lock(&rs->src_page_req_mutex); 1907 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1908 migration_make_urgent_request(); 1909 qemu_mutex_unlock(&rs->src_page_req_mutex); 1910 1911 return 0; 1912 } 1913 1914 static bool save_page_use_compression(RAMState *rs) 1915 { 1916 if (!migrate_use_compression()) { 1917 return false; 1918 } 1919 1920 /* 1921 * If xbzrle is enabled (e.g., after first round of migration), stop 1922 * using the data compression. In theory, xbzrle can do better than 1923 * compression. 
1924 */ 1925 if (rs->xbzrle_enabled) { 1926 return false; 1927 } 1928 1929 return true; 1930 } 1931 1932 /* 1933 * try to compress the page before posting it out, return true if the page 1934 * has been properly handled by compression, otherwise needs other 1935 * paths to handle it 1936 */ 1937 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1938 { 1939 if (!save_page_use_compression(rs)) { 1940 return false; 1941 } 1942 1943 /* 1944 * When starting the process of a new block, the first page of 1945 * the block should be sent out before other pages in the same 1946 * block, and all the pages in last block should have been sent 1947 * out, keeping this order is important, because the 'cont' flag 1948 * is used to avoid resending the block name. 1949 * 1950 * We post the fist page as normal page as compression will take 1951 * much CPU resource. 1952 */ 1953 if (block != rs->last_sent_block) { 1954 flush_compressed_data(rs); 1955 return false; 1956 } 1957 1958 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1959 return true; 1960 } 1961 1962 compression_counters.busy++; 1963 return false; 1964 } 1965 1966 /** 1967 * ram_save_target_page: save one target page 1968 * 1969 * Returns the number of pages written 1970 * 1971 * @rs: current RAM state 1972 * @pss: data about the page we want to send 1973 * @last_stage: if we are at the completion stage 1974 */ 1975 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1976 bool last_stage) 1977 { 1978 RAMBlock *block = pss->block; 1979 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1980 int res; 1981 1982 if (control_save_page(rs, block, offset, &res)) { 1983 return res; 1984 } 1985 1986 if (save_compress_page(rs, block, offset)) { 1987 return 1; 1988 } 1989 1990 res = save_zero_page(rs, block, offset); 1991 if (res > 0) { 1992 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1993 * page would be stale 1994 */ 1995 if (!save_page_use_compression(rs)) { 1996 XBZRLE_cache_lock(); 1997 xbzrle_cache_zero_page(rs, block->offset + offset); 1998 XBZRLE_cache_unlock(); 1999 } 2000 ram_release_pages(block->idstr, offset, res); 2001 return res; 2002 } 2003 2004 /* 2005 * Do not use multifd for: 2006 * 1. Compression as the first page in the new block should be posted out 2007 * before sending the compressed page 2008 * 2. In postcopy as one whole host page should be placed 2009 */ 2010 if (!save_page_use_compression(rs) && migrate_use_multifd() 2011 && !migration_in_postcopy()) { 2012 return ram_save_multifd_page(rs, block, offset); 2013 } 2014 2015 return ram_save_page(rs, pss, last_stage); 2016 } 2017 2018 /** 2019 * ram_save_host_page: save a whole host page 2020 * 2021 * Starting at *offset send pages up to the end of the current host 2022 * page. It's valid for the initial offset to point into the middle of 2023 * a host page in which case the remainder of the hostpage is sent. 2024 * Only dirty target pages are sent. Note that the host page size may 2025 * be a huge page for this block. 2026 * The saving stops at the boundary of the used_length of the block 2027 * if the RAMBlock isn't a multiple of the host page size. 
* 2028 * 2029 * Returns the number of pages written or negative on error 2030 * 2031 * @rs: current RAM state 2032 * 2033 * @pss: data about the page we want to send 2034 * @last_stage: if we are at the completion stage 2035 */ 2036 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 2037 bool last_stage) 2038 { 2039 int tmppages, pages = 0; 2040 size_t pagesize_bits = 2041 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2042 unsigned long hostpage_boundary = 2043 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2044 unsigned long start_page = pss->page; 2045 int res; 2046 2047 if (ramblock_is_ignored(pss->block)) { 2048 error_report("block %s should not be migrated !", pss->block->idstr); 2049 return 0; 2050 } 2051 2052 do { 2053 /* Check whether the page is dirty, and if so, send it */ 2054 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2055 tmppages = ram_save_target_page(rs, pss, last_stage); 2056 if (tmppages < 0) { 2057 return tmppages; 2058 } 2059 2060 pages += tmppages; 2061 /* 2062 * Allow rate limiting to happen in the middle of huge pages if 2063 * something is sent in the current iteration. 2064 */ 2065 if (pagesize_bits > 1 && tmppages > 0) { 2066 migration_rate_limit(); 2067 } 2068 } 2069 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2070 } while ((pss->page < hostpage_boundary) && 2071 offset_in_ramblock(pss->block, 2072 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2073 /* The offset we leave with is the min boundary of host page and block */ 2074 pss->page = MIN(pss->page, hostpage_boundary) - 1; 2075 2076 res = ram_save_release_protection(rs, pss, start_page); 2077 return (res < 0 ? res : pages); 2078 } 2079 2080 /** 2081 * ram_find_and_save_block: finds a dirty page and sends it to f 2082 * 2083 * Called within an RCU critical section. 2084 * 2085 * Returns the number of pages written, where zero means no dirty pages, 2086 * or negative on error 2087 * 2088 * @rs: current RAM state 2089 * @last_stage: if we are at the completion stage 2090 * 2091 * On systems where host-page-size > target-page-size it will send all the 2092 * pages in a host page that are dirty.
2093 */ 2094 2095 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2096 { 2097 PageSearchStatus pss; 2098 int pages = 0; 2099 bool again, found; 2100 2101 /* No dirty page as there is zero RAM */ 2102 if (!ram_bytes_total()) { 2103 return pages; 2104 } 2105 2106 pss.block = rs->last_seen_block; 2107 pss.page = rs->last_page; 2108 pss.complete_round = false; 2109 2110 if (!pss.block) { 2111 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2112 } 2113 2114 do { 2115 again = true; 2116 found = get_queued_page(rs, &pss); 2117 2118 if (!found) { 2119 /* priority queue empty, so just search for something dirty */ 2120 found = find_dirty_block(rs, &pss, &again); 2121 } 2122 2123 if (found) { 2124 pages = ram_save_host_page(rs, &pss, last_stage); 2125 } 2126 } while (!pages && again); 2127 2128 rs->last_seen_block = pss.block; 2129 rs->last_page = pss.page; 2130 2131 return pages; 2132 } 2133 2134 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2135 { 2136 uint64_t pages = size / TARGET_PAGE_SIZE; 2137 2138 if (zero) { 2139 ram_counters.duplicate += pages; 2140 } else { 2141 ram_counters.normal += pages; 2142 ram_counters.transferred += size; 2143 qemu_update_position(f, size); 2144 } 2145 } 2146 2147 static uint64_t ram_bytes_total_common(bool count_ignored) 2148 { 2149 RAMBlock *block; 2150 uint64_t total = 0; 2151 2152 RCU_READ_LOCK_GUARD(); 2153 2154 if (count_ignored) { 2155 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2156 total += block->used_length; 2157 } 2158 } else { 2159 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2160 total += block->used_length; 2161 } 2162 } 2163 return total; 2164 } 2165 2166 uint64_t ram_bytes_total(void) 2167 { 2168 return ram_bytes_total_common(false); 2169 } 2170 2171 static void xbzrle_load_setup(void) 2172 { 2173 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2174 } 2175 2176 static void xbzrle_load_cleanup(void) 2177 { 2178 g_free(XBZRLE.decoded_buf); 2179 XBZRLE.decoded_buf = NULL; 2180 } 2181 2182 static void ram_state_cleanup(RAMState **rsp) 2183 { 2184 if (*rsp) { 2185 migration_page_queue_free(*rsp); 2186 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2187 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2188 g_free(*rsp); 2189 *rsp = NULL; 2190 } 2191 } 2192 2193 static void xbzrle_cleanup(void) 2194 { 2195 XBZRLE_cache_lock(); 2196 if (XBZRLE.cache) { 2197 cache_fini(XBZRLE.cache); 2198 g_free(XBZRLE.encoded_buf); 2199 g_free(XBZRLE.current_buf); 2200 g_free(XBZRLE.zero_target_page); 2201 XBZRLE.cache = NULL; 2202 XBZRLE.encoded_buf = NULL; 2203 XBZRLE.current_buf = NULL; 2204 XBZRLE.zero_target_page = NULL; 2205 } 2206 XBZRLE_cache_unlock(); 2207 } 2208 2209 static void ram_save_cleanup(void *opaque) 2210 { 2211 RAMState **rsp = opaque; 2212 RAMBlock *block; 2213 2214 /* We don't use dirty log with background snapshots */ 2215 if (!migrate_background_snapshot()) { 2216 /* caller have hold iothread lock or is in a bh, so there is 2217 * no writing race against the migration bitmap 2218 */ 2219 memory_global_dirty_log_stop(); 2220 } 2221 2222 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2223 g_free(block->clear_bmap); 2224 block->clear_bmap = NULL; 2225 g_free(block->bmap); 2226 block->bmap = NULL; 2227 } 2228 2229 xbzrle_cleanup(); 2230 compress_threads_save_cleanup(); 2231 ram_state_cleanup(rsp); 2232 } 2233 2234 static void ram_state_reset(RAMState *rs) 2235 { 2236 rs->last_seen_block = NULL; 2237 rs->last_sent_block = NULL; 2238 rs->last_page = 0; 2239 rs->last_version = ram_list.version; 2240 rs->xbzrle_enabled = false; 2241 } 2242 2243 
#define MAX_WAIT 50 /* ms, half buffered_file limit */ 2244 2245 /* 2246 * 'expected' is the value you expect the bitmap mostly to be full 2247 * of; it won't bother printing lines that are all this value. 2248 * If 'todump' is null the migration bitmap is dumped. 2249 */ 2250 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2251 unsigned long pages) 2252 { 2253 int64_t cur; 2254 int64_t linelen = 128; 2255 char linebuf[129]; 2256 2257 for (cur = 0; cur < pages; cur += linelen) { 2258 int64_t curb; 2259 bool found = false; 2260 /* 2261 * Last line; catch the case where the line length 2262 * is longer than remaining ram 2263 */ 2264 if (cur + linelen > pages) { 2265 linelen = pages - cur; 2266 } 2267 for (curb = 0; curb < linelen; curb++) { 2268 bool thisbit = test_bit(cur + curb, todump); 2269 linebuf[curb] = thisbit ? '1' : '.'; 2270 found = found || (thisbit != expected); 2271 } 2272 if (found) { 2273 linebuf[curb] = '\0'; 2274 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2275 } 2276 } 2277 } 2278 2279 /* **** functions for postcopy ***** */ 2280 2281 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2282 { 2283 struct RAMBlock *block; 2284 2285 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2286 unsigned long *bitmap = block->bmap; 2287 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2288 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2289 2290 while (run_start < range) { 2291 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2292 ram_discard_range(block->idstr, 2293 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2294 ((ram_addr_t)(run_end - run_start)) 2295 << TARGET_PAGE_BITS); 2296 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2297 } 2298 } 2299 } 2300 2301 /** 2302 * postcopy_send_discard_bm_ram: discard a RAMBlock 2303 * 2304 * Returns zero on success 2305 * 2306 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2307 * 2308 * @ms: current migration state 2309 * @block: RAMBlock to discard 2310 */ 2311 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2312 { 2313 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2314 unsigned long current; 2315 unsigned long *bitmap = block->bmap; 2316 2317 for (current = 0; current < end; ) { 2318 unsigned long one = find_next_bit(bitmap, end, current); 2319 unsigned long zero, discard_length; 2320 2321 if (one >= end) { 2322 break; 2323 } 2324 2325 zero = find_next_zero_bit(bitmap, end, one + 1); 2326 2327 if (zero >= end) { 2328 discard_length = end - one; 2329 } else { 2330 discard_length = zero - one; 2331 } 2332 postcopy_discard_send_range(ms, one, discard_length); 2333 current = one + discard_length; 2334 } 2335 2336 return 0; 2337 } 2338 2339 /** 2340 * postcopy_each_ram_send_discard: discard all RAMBlocks 2341 * 2342 * Returns 0 for success or negative for error 2343 * 2344 * Utility for the outgoing postcopy code. 2345 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2346 * passing it bitmap indexes and name. 
2347 * (qemu_ram_foreach_block ends up passing unscaled lengths 2348 * which would mean postcopy code would have to deal with target page) 2349 * 2350 * @ms: current migration state 2351 */ 2352 static int postcopy_each_ram_send_discard(MigrationState *ms) 2353 { 2354 struct RAMBlock *block; 2355 int ret; 2356 2357 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2358 postcopy_discard_send_init(ms, block->idstr); 2359 2360 /* 2361 * Postcopy sends chunks of bitmap over the wire, but it 2362 * just needs indexes at this point, avoids it having 2363 * target page specific code. 2364 */ 2365 ret = postcopy_send_discard_bm_ram(ms, block); 2366 postcopy_discard_send_finish(ms); 2367 if (ret) { 2368 return ret; 2369 } 2370 } 2371 2372 return 0; 2373 } 2374 2375 /** 2376 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2377 * 2378 * Helper for postcopy_chunk_hostpages; it's called twice to 2379 * canonicalize the two bitmaps, that are similar, but one is 2380 * inverted. 2381 * 2382 * Postcopy requires that all target pages in a hostpage are dirty or 2383 * clean, not a mix. This function canonicalizes the bitmaps. 2384 * 2385 * @ms: current migration state 2386 * @block: block that contains the page we want to canonicalize 2387 */ 2388 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2389 { 2390 RAMState *rs = ram_state; 2391 unsigned long *bitmap = block->bmap; 2392 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2393 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2394 unsigned long run_start; 2395 2396 if (block->page_size == TARGET_PAGE_SIZE) { 2397 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2398 return; 2399 } 2400 2401 /* Find a dirty page */ 2402 run_start = find_next_bit(bitmap, pages, 0); 2403 2404 while (run_start < pages) { 2405 2406 /* 2407 * If the start of this run of pages is in the middle of a host 2408 * page, then we need to fixup this host page. 2409 */ 2410 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2411 /* Find the end of this run */ 2412 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2413 /* 2414 * If the end isn't at the start of a host page, then the 2415 * run doesn't finish at the end of a host page 2416 * and we need to discard. 2417 */ 2418 } 2419 2420 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2421 unsigned long page; 2422 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2423 host_ratio); 2424 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2425 2426 /* Clean up the bitmap */ 2427 for (page = fixup_start_addr; 2428 page < fixup_start_addr + host_ratio; page++) { 2429 /* 2430 * Remark them as dirty, updating the count for any pages 2431 * that weren't previously dirty. 2432 */ 2433 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2434 } 2435 } 2436 2437 /* Find the next dirty page for the next iteration */ 2438 run_start = find_next_bit(bitmap, pages, run_start); 2439 } 2440 } 2441 2442 /** 2443 * postcopy_chunk_hostpages: discard any partially sent host page 2444 * 2445 * Utility for the outgoing postcopy code. 2446 * 2447 * Discard any partially sent host-page size chunks, mark any partially 2448 * dirty host-page size chunks as all dirty. In this case the host-page 2449 * is the host-page for the particular RAMBlock, i.e. 
it might be a huge page 2450 * 2451 * Returns zero on success 2452 * 2453 * @ms: current migration state 2454 * @block: block we want to work with 2455 */ 2456 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2457 { 2458 postcopy_discard_send_init(ms, block->idstr); 2459 2460 /* 2461 * Ensure that all partially dirty host pages are made fully dirty. 2462 */ 2463 postcopy_chunk_hostpages_pass(ms, block); 2464 2465 postcopy_discard_send_finish(ms); 2466 return 0; 2467 } 2468 2469 /** 2470 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2471 * 2472 * Returns zero on success 2473 * 2474 * Transmit the set of pages to be discarded after precopy to the target 2475 * these are pages that: 2476 * a) Have been previously transmitted but are now dirty again 2477 * b) Pages that have never been transmitted, this ensures that 2478 * any pages on the destination that have been mapped by background 2479 * tasks get discarded (transparent huge pages is the specific concern) 2480 * Hopefully this is pretty sparse 2481 * 2482 * @ms: current migration state 2483 */ 2484 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2485 { 2486 RAMState *rs = ram_state; 2487 RAMBlock *block; 2488 int ret; 2489 2490 RCU_READ_LOCK_GUARD(); 2491 2492 /* This should be our last sync, the src is now paused */ 2493 migration_bitmap_sync(rs); 2494 2495 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2496 rs->last_seen_block = NULL; 2497 rs->last_sent_block = NULL; 2498 rs->last_page = 0; 2499 2500 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2501 /* Deal with TPS != HPS and huge pages */ 2502 ret = postcopy_chunk_hostpages(ms, block); 2503 if (ret) { 2504 return ret; 2505 } 2506 2507 #ifdef DEBUG_POSTCOPY 2508 ram_debug_dump_bitmap(block->bmap, true, 2509 block->used_length >> TARGET_PAGE_BITS); 2510 #endif 2511 } 2512 trace_ram_postcopy_send_discard_bitmap(); 2513 2514 return postcopy_each_ram_send_discard(ms); 2515 } 2516 2517 /** 2518 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2519 * 2520 * Returns zero on success 2521 * 2522 * @rbname: name of the RAMBlock of the request. NULL means the 2523 * same that last one. 2524 * @start: RAMBlock starting page 2525 * @length: RAMBlock size 2526 */ 2527 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2528 { 2529 trace_ram_discard_range(rbname, start, length); 2530 2531 RCU_READ_LOCK_GUARD(); 2532 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2533 2534 if (!rb) { 2535 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2536 return -1; 2537 } 2538 2539 /* 2540 * On source VM, we don't need to update the received bitmap since 2541 * we don't even have one. 2542 */ 2543 if (rb->receivedmap) { 2544 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2545 length >> qemu_target_page_bits()); 2546 } 2547 2548 return ram_block_discard_range(rb, start, length); 2549 } 2550 2551 /* 2552 * For every allocation, we will try not to crash the VM if the 2553 * allocation failed. 
2554 */ 2555 static int xbzrle_init(void) 2556 { 2557 Error *local_err = NULL; 2558 2559 if (!migrate_use_xbzrle()) { 2560 return 0; 2561 } 2562 2563 XBZRLE_cache_lock(); 2564 2565 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2566 if (!XBZRLE.zero_target_page) { 2567 error_report("%s: Error allocating zero page", __func__); 2568 goto err_out; 2569 } 2570 2571 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2572 TARGET_PAGE_SIZE, &local_err); 2573 if (!XBZRLE.cache) { 2574 error_report_err(local_err); 2575 goto free_zero_page; 2576 } 2577 2578 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2579 if (!XBZRLE.encoded_buf) { 2580 error_report("%s: Error allocating encoded_buf", __func__); 2581 goto free_cache; 2582 } 2583 2584 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2585 if (!XBZRLE.current_buf) { 2586 error_report("%s: Error allocating current_buf", __func__); 2587 goto free_encoded_buf; 2588 } 2589 2590 /* We are all good */ 2591 XBZRLE_cache_unlock(); 2592 return 0; 2593 2594 free_encoded_buf: 2595 g_free(XBZRLE.encoded_buf); 2596 XBZRLE.encoded_buf = NULL; 2597 free_cache: 2598 cache_fini(XBZRLE.cache); 2599 XBZRLE.cache = NULL; 2600 free_zero_page: 2601 g_free(XBZRLE.zero_target_page); 2602 XBZRLE.zero_target_page = NULL; 2603 err_out: 2604 XBZRLE_cache_unlock(); 2605 return -ENOMEM; 2606 } 2607 2608 static int ram_state_init(RAMState **rsp) 2609 { 2610 *rsp = g_try_new0(RAMState, 1); 2611 2612 if (!*rsp) { 2613 error_report("%s: Init ramstate fail", __func__); 2614 return -1; 2615 } 2616 2617 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2618 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2619 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2620 2621 /* 2622 * Count the total number of pages used by ram blocks not including any 2623 * gaps due to alignment or unplugs. 2624 * This must match with the initial values of dirty bitmap. 2625 */ 2626 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2627 ram_state_reset(*rsp); 2628 2629 return 0; 2630 } 2631 2632 static void ram_list_init_bitmaps(void) 2633 { 2634 MigrationState *ms = migrate_get_current(); 2635 RAMBlock *block; 2636 unsigned long pages; 2637 uint8_t shift; 2638 2639 /* Skip setting bitmap if there is no RAM */ 2640 if (ram_bytes_total()) { 2641 shift = ms->clear_bitmap_shift; 2642 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2643 error_report("clear_bitmap_shift (%u) too big, using " 2644 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2645 shift = CLEAR_BITMAP_SHIFT_MAX; 2646 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2647 error_report("clear_bitmap_shift (%u) too small, using " 2648 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2649 shift = CLEAR_BITMAP_SHIFT_MIN; 2650 } 2651 2652 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2653 pages = block->max_length >> TARGET_PAGE_BITS; 2654 /* 2655 * The initial dirty bitmap for migration must be set with all 2656 * ones to make sure we'll migrate every guest RAM page to 2657 * destination. 2658 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2659 * new migration after a failed migration, ram_list. 2660 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2661 * guest memory. 2662 */ 2663 block->bmap = bitmap_new(pages); 2664 bitmap_set(block->bmap, 0, pages); 2665 block->clear_bmap_shift = shift; 2666 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2667 } 2668 } 2669 } 2670 2671 static void ram_init_bitmaps(RAMState *rs) 2672 { 2673 /* For memory_global_dirty_log_start below. 
*/ 2674 qemu_mutex_lock_iothread(); 2675 qemu_mutex_lock_ramlist(); 2676 2677 WITH_RCU_READ_LOCK_GUARD() { 2678 ram_list_init_bitmaps(); 2679 /* We don't use dirty log with background snapshots */ 2680 if (!migrate_background_snapshot()) { 2681 memory_global_dirty_log_start(); 2682 migration_bitmap_sync_precopy(rs); 2683 } 2684 } 2685 qemu_mutex_unlock_ramlist(); 2686 qemu_mutex_unlock_iothread(); 2687 } 2688 2689 static int ram_init_all(RAMState **rsp) 2690 { 2691 if (ram_state_init(rsp)) { 2692 return -1; 2693 } 2694 2695 if (xbzrle_init()) { 2696 ram_state_cleanup(rsp); 2697 return -1; 2698 } 2699 2700 ram_init_bitmaps(*rsp); 2701 2702 return 0; 2703 } 2704 2705 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2706 { 2707 RAMBlock *block; 2708 uint64_t pages = 0; 2709 2710 /* 2711 * Postcopy is not using xbzrle/compression, so no need for that. 2712 * Also, since source are already halted, we don't need to care 2713 * about dirty page logging as well. 2714 */ 2715 2716 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2717 pages += bitmap_count_one(block->bmap, 2718 block->used_length >> TARGET_PAGE_BITS); 2719 } 2720 2721 /* This may not be aligned with current bitmaps. Recalculate. */ 2722 rs->migration_dirty_pages = pages; 2723 2724 ram_state_reset(rs); 2725 2726 /* Update RAMState cache of output QEMUFile */ 2727 rs->f = out; 2728 2729 trace_ram_state_resume_prepare(pages); 2730 } 2731 2732 /* 2733 * This function clears bits of the free pages reported by the caller from the 2734 * migration dirty bitmap. @addr is the host address corresponding to the 2735 * start of the continuous guest free pages, and @len is the total bytes of 2736 * those pages. 2737 */ 2738 void qemu_guest_free_page_hint(void *addr, size_t len) 2739 { 2740 RAMBlock *block; 2741 ram_addr_t offset; 2742 size_t used_len, start, npages; 2743 MigrationState *s = migrate_get_current(); 2744 2745 /* This function is currently expected to be used during live migration */ 2746 if (!migration_is_setup_or_active(s->state)) { 2747 return; 2748 } 2749 2750 for (; len > 0; len -= used_len, addr += used_len) { 2751 block = qemu_ram_block_from_host(addr, false, &offset); 2752 if (unlikely(!block || offset >= block->used_length)) { 2753 /* 2754 * The implementation might not support RAMBlock resize during 2755 * live migration, but it could happen in theory with future 2756 * updates. So we add a check here to capture that case. 2757 */ 2758 error_report_once("%s unexpected error", __func__); 2759 return; 2760 } 2761 2762 if (len <= block->used_length - offset) { 2763 used_len = len; 2764 } else { 2765 used_len = block->used_length - offset; 2766 } 2767 2768 start = offset >> TARGET_PAGE_BITS; 2769 npages = used_len >> TARGET_PAGE_BITS; 2770 2771 qemu_mutex_lock(&ram_state->bitmap_mutex); 2772 /* 2773 * The skipped free pages are equavalent to be sent from clear_bmap's 2774 * perspective, so clear the bits from the memory region bitmap which 2775 * are initially set. Otherwise those skipped pages will be sent in 2776 * the next round after syncing from the memory region bitmap. 2777 */ 2778 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2779 ram_state->migration_dirty_pages -= 2780 bitmap_count_one_with_offset(block->bmap, start, npages); 2781 bitmap_clear(block->bmap, start, npages); 2782 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2783 } 2784 } 2785 2786 /* 2787 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2788 * long-running RCU critical section. 
When RCU reclaims in the code 2789 * start to become numerous it will be necessary to reduce the 2790 * granularity of these critical sections. 2791 */ 2792 2793 /** 2794 * ram_save_setup: Setup RAM for migration 2795 * 2796 * Returns zero to indicate success and negative for error 2797 * 2798 * @f: QEMUFile where to send the data 2799 * @opaque: RAMState pointer 2800 */ 2801 static int ram_save_setup(QEMUFile *f, void *opaque) 2802 { 2803 RAMState **rsp = opaque; 2804 RAMBlock *block; 2805 2806 if (compress_threads_save_setup()) { 2807 return -1; 2808 } 2809 2810 /* migration has already set up the bitmap; reuse it. */ 2811 if (!migration_in_colo_state()) { 2812 if (ram_init_all(rsp) != 0) { 2813 compress_threads_save_cleanup(); 2814 return -1; 2815 } 2816 } 2817 (*rsp)->f = f; 2818 2819 WITH_RCU_READ_LOCK_GUARD() { 2820 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2821 2822 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2823 qemu_put_byte(f, strlen(block->idstr)); 2824 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2825 qemu_put_be64(f, block->used_length); 2826 if (migrate_postcopy_ram() && block->page_size != 2827 qemu_host_page_size) { 2828 qemu_put_be64(f, block->page_size); 2829 } 2830 if (migrate_ignore_shared()) { 2831 qemu_put_be64(f, block->mr->addr); 2832 } 2833 } 2834 } 2835 2836 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2837 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2838 2839 multifd_send_sync_main(f); 2840 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2841 qemu_fflush(f); 2842 2843 return 0; 2844 } 2845 2846 /** 2847 * ram_save_iterate: iterative stage for migration 2848 * 2849 * Returns zero to indicate success and negative for error 2850 * 2851 * @f: QEMUFile where to send the data 2852 * @opaque: RAMState pointer 2853 */ 2854 static int ram_save_iterate(QEMUFile *f, void *opaque) 2855 { 2856 RAMState **temp = opaque; 2857 RAMState *rs = *temp; 2858 int ret = 0; 2859 int i; 2860 int64_t t0; 2861 int done = 0; 2862 2863 if (blk_mig_bulk_active()) { 2864 /* Avoid transferring RAM during the bulk phase of block migration, as 2865 * the bulk phase will usually take a long time and transferring 2866 * RAM updates during that time is pointless. */ 2867 goto out; 2868 } 2869 2870 /* 2871 * We'll hold this lock a little bit long, but it's okay for two reasons. 2872 * Firstly, the only other thread that could take it is the one calling 2873 * qemu_guest_free_page_hint(), which should be rare; secondly, see 2874 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 2875 * guarantees that we'll at least release it on a regular basis.
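 *
 * Concretely (a hedged reading of the loop below): whenever (i & 63) == 0,
 * including the very first iteration, the elapsed time since t0 is
 * converted from nanoseconds to milliseconds and compared against
 * MAX_WAIT (50 ms); if, say, those iterations took 60 ms, then t1 == 60,
 * 60 > 50, and we break out so the bitmap_mutex gets dropped promptly.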
2876 */ 2877 qemu_mutex_lock(&rs->bitmap_mutex); 2878 WITH_RCU_READ_LOCK_GUARD() { 2879 if (ram_list.version != rs->last_version) { 2880 ram_state_reset(rs); 2881 } 2882 2883 /* Read version before ram_list.blocks */ 2884 smp_rmb(); 2885 2886 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2887 2888 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2889 i = 0; 2890 while ((ret = qemu_file_rate_limit(f)) == 0 || 2891 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2892 int pages; 2893 2894 if (qemu_file_get_error(f)) { 2895 break; 2896 } 2897 2898 pages = ram_find_and_save_block(rs, false); 2899 /* no more pages to sent */ 2900 if (pages == 0) { 2901 done = 1; 2902 break; 2903 } 2904 2905 if (pages < 0) { 2906 qemu_file_set_error(f, pages); 2907 break; 2908 } 2909 2910 rs->target_page_count += pages; 2911 2912 /* 2913 * During postcopy, it is necessary to make sure one whole host 2914 * page is sent in one chunk. 2915 */ 2916 if (migrate_postcopy_ram()) { 2917 flush_compressed_data(rs); 2918 } 2919 2920 /* 2921 * we want to check in the 1st loop, just in case it was the 1st 2922 * time and we had to sync the dirty bitmap. 2923 * qemu_clock_get_ns() is a bit expensive, so we only check each 2924 * some iterations 2925 */ 2926 if ((i & 63) == 0) { 2927 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2928 1000000; 2929 if (t1 > MAX_WAIT) { 2930 trace_ram_save_iterate_big_wait(t1, i); 2931 break; 2932 } 2933 } 2934 i++; 2935 } 2936 } 2937 qemu_mutex_unlock(&rs->bitmap_mutex); 2938 2939 /* 2940 * Must occur before EOS (or any QEMUFile operation) 2941 * because of RDMA protocol. 2942 */ 2943 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2944 2945 out: 2946 if (ret >= 0 2947 && migration_is_setup_or_active(migrate_get_current()->state)) { 2948 multifd_send_sync_main(rs->f); 2949 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2950 qemu_fflush(f); 2951 ram_counters.transferred += 8; 2952 2953 ret = qemu_file_get_error(f); 2954 } 2955 if (ret < 0) { 2956 return ret; 2957 } 2958 2959 return done; 2960 } 2961 2962 /** 2963 * ram_save_complete: function called to send the remaining amount of ram 2964 * 2965 * Returns zero to indicate success or negative on error 2966 * 2967 * Called with iothread lock 2968 * 2969 * @f: QEMUFile where to send the data 2970 * @opaque: RAMState pointer 2971 */ 2972 static int ram_save_complete(QEMUFile *f, void *opaque) 2973 { 2974 RAMState **temp = opaque; 2975 RAMState *rs = *temp; 2976 int ret = 0; 2977 2978 WITH_RCU_READ_LOCK_GUARD() { 2979 if (!migration_in_postcopy()) { 2980 migration_bitmap_sync_precopy(rs); 2981 } 2982 2983 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2984 2985 /* try transferring iterative blocks of memory */ 2986 2987 /* flush all remaining blocks regardless of rate limiting */ 2988 while (true) { 2989 int pages; 2990 2991 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2992 /* no more blocks to sent */ 2993 if (pages == 0) { 2994 break; 2995 } 2996 if (pages < 0) { 2997 ret = pages; 2998 break; 2999 } 3000 } 3001 3002 flush_compressed_data(rs); 3003 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3004 } 3005 3006 if (ret >= 0) { 3007 multifd_send_sync_main(rs->f); 3008 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3009 qemu_fflush(f); 3010 } 3011 3012 return ret; 3013 } 3014 3015 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3016 uint64_t *res_precopy_only, 3017 uint64_t *res_compatible, 3018 uint64_t *res_postcopy_only) 3019 { 3020 RAMState **temp = opaque; 3021 RAMState *rs = *temp; 3022 uint64_t 
remaining_size; 3023 3024 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3025 3026 if (!migration_in_postcopy() && 3027 remaining_size < max_size) { 3028 qemu_mutex_lock_iothread(); 3029 WITH_RCU_READ_LOCK_GUARD() { 3030 migration_bitmap_sync_precopy(rs); 3031 } 3032 qemu_mutex_unlock_iothread(); 3033 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3034 } 3035 3036 if (migrate_postcopy_ram()) { 3037 /* We can do postcopy, and all the data is postcopiable */ 3038 *res_compatible += remaining_size; 3039 } else { 3040 *res_precopy_only += remaining_size; 3041 } 3042 } 3043 3044 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3045 { 3046 unsigned int xh_len; 3047 int xh_flags; 3048 uint8_t *loaded_data; 3049 3050 /* extract RLE header */ 3051 xh_flags = qemu_get_byte(f); 3052 xh_len = qemu_get_be16(f); 3053 3054 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3055 error_report("Failed to load XBZRLE page - wrong compression!"); 3056 return -1; 3057 } 3058 3059 if (xh_len > TARGET_PAGE_SIZE) { 3060 error_report("Failed to load XBZRLE page - len overflow!"); 3061 return -1; 3062 } 3063 loaded_data = XBZRLE.decoded_buf; 3064 /* load data and decode */ 3065 /* it can change loaded_data to point to an internal buffer */ 3066 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3067 3068 /* decode RLE */ 3069 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3070 TARGET_PAGE_SIZE) == -1) { 3071 error_report("Failed to load XBZRLE page - decode error!"); 3072 return -1; 3073 } 3074 3075 return 0; 3076 } 3077 3078 /** 3079 * ram_block_from_stream: read a RAMBlock id from the migration stream 3080 * 3081 * Must be called from within a rcu critical section. 3082 * 3083 * Returns a pointer from within the RCU-protected ram_list. 3084 * 3085 * @f: QEMUFile where to read the data from 3086 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3087 */ 3088 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3089 { 3090 static RAMBlock *block; 3091 char id[256]; 3092 uint8_t len; 3093 3094 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3095 if (!block) { 3096 error_report("Ack, bad migration stream!"); 3097 return NULL; 3098 } 3099 return block; 3100 } 3101 3102 len = qemu_get_byte(f); 3103 qemu_get_buffer(f, (uint8_t *)id, len); 3104 id[len] = 0; 3105 3106 block = qemu_ram_block_by_name(id); 3107 if (!block) { 3108 error_report("Can't find block %s", id); 3109 return NULL; 3110 } 3111 3112 if (ramblock_is_ignored(block)) { 3113 error_report("block %s should not be migrated !", id); 3114 return NULL; 3115 } 3116 3117 return block; 3118 } 3119 3120 static inline void *host_from_ram_block_offset(RAMBlock *block, 3121 ram_addr_t offset) 3122 { 3123 if (!offset_in_ramblock(block, offset)) { 3124 return NULL; 3125 } 3126 3127 return block->host + offset; 3128 } 3129 3130 static void *host_page_from_ram_block_offset(RAMBlock *block, 3131 ram_addr_t offset) 3132 { 3133 /* Note: Explicitly no check against offset_in_ramblock(). 
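 *
 * A hedged illustration (hypothetical numbers, and assuming block->host is
 * itself huge-page aligned, as hugetlbfs mappings are): with
 * block->page_size == 2 MiB,
 *
 *     host_page_from_ram_block_offset(block, 0x201000)
 *         == block->host + 0x200000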
*/ 3134 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3135 block->page_size); 3136 } 3137 3138 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3139 ram_addr_t offset) 3140 { 3141 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3142 } 3143 3144 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3145 ram_addr_t offset, bool record_bitmap) 3146 { 3147 if (!offset_in_ramblock(block, offset)) { 3148 return NULL; 3149 } 3150 if (!block->colo_cache) { 3151 error_report("%s: colo_cache is NULL in block :%s", 3152 __func__, block->idstr); 3153 return NULL; 3154 } 3155 3156 /* 3157 * During colo checkpoint, we need bitmap of these migrated pages. 3158 * It help us to decide which pages in ram cache should be flushed 3159 * into VM's RAM later. 3160 */ 3161 if (record_bitmap && 3162 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3163 ram_state->migration_dirty_pages++; 3164 } 3165 return block->colo_cache + offset; 3166 } 3167 3168 /** 3169 * ram_handle_compressed: handle the zero page case 3170 * 3171 * If a page (or a whole RDMA chunk) has been 3172 * determined to be zero, then zap it. 3173 * 3174 * @host: host address for the zero page 3175 * @ch: what the page is filled from. We only support zero 3176 * @size: size of the zero page 3177 */ 3178 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3179 { 3180 if (ch != 0 || !is_zero_range(host, size)) { 3181 memset(host, ch, size); 3182 } 3183 } 3184 3185 /* return the size after decompression, or negative value on error */ 3186 static int 3187 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3188 const uint8_t *source, size_t source_len) 3189 { 3190 int err; 3191 3192 err = inflateReset(stream); 3193 if (err != Z_OK) { 3194 return -1; 3195 } 3196 3197 stream->avail_in = source_len; 3198 stream->next_in = (uint8_t *)source; 3199 stream->avail_out = dest_len; 3200 stream->next_out = dest; 3201 3202 err = inflate(stream, Z_NO_FLUSH); 3203 if (err != Z_STREAM_END) { 3204 return -1; 3205 } 3206 3207 return stream->total_out; 3208 } 3209 3210 static void *do_data_decompress(void *opaque) 3211 { 3212 DecompressParam *param = opaque; 3213 unsigned long pagesize; 3214 uint8_t *des; 3215 int len, ret; 3216 3217 qemu_mutex_lock(¶m->mutex); 3218 while (!param->quit) { 3219 if (param->des) { 3220 des = param->des; 3221 len = param->len; 3222 param->des = 0; 3223 qemu_mutex_unlock(¶m->mutex); 3224 3225 pagesize = TARGET_PAGE_SIZE; 3226 3227 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3228 param->compbuf, len); 3229 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3230 error_report("decompress data failed"); 3231 qemu_file_set_error(decomp_file, ret); 3232 } 3233 3234 qemu_mutex_lock(&decomp_done_lock); 3235 param->done = true; 3236 qemu_cond_signal(&decomp_done_cond); 3237 qemu_mutex_unlock(&decomp_done_lock); 3238 3239 qemu_mutex_lock(¶m->mutex); 3240 } else { 3241 qemu_cond_wait(¶m->cond, ¶m->mutex); 3242 } 3243 } 3244 qemu_mutex_unlock(¶m->mutex); 3245 3246 return NULL; 3247 } 3248 3249 static int wait_for_decompress_done(void) 3250 { 3251 int idx, thread_count; 3252 3253 if (!migrate_use_compression()) { 3254 return 0; 3255 } 3256 3257 thread_count = migrate_decompress_threads(); 3258 qemu_mutex_lock(&decomp_done_lock); 3259 for (idx = 0; idx < thread_count; idx++) { 3260 while (!decomp_param[idx].done) { 3261 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3262 } 3263 } 3264 
qemu_mutex_unlock(&decomp_done_lock); 3265 return qemu_file_get_error(decomp_file); 3266 } 3267 3268 static void compress_threads_load_cleanup(void) 3269 { 3270 int i, thread_count; 3271 3272 if (!migrate_use_compression()) { 3273 return; 3274 } 3275 thread_count = migrate_decompress_threads(); 3276 for (i = 0; i < thread_count; i++) { 3277 /* 3278 * we use it as a indicator which shows if the thread is 3279 * properly init'd or not 3280 */ 3281 if (!decomp_param[i].compbuf) { 3282 break; 3283 } 3284 3285 qemu_mutex_lock(&decomp_param[i].mutex); 3286 decomp_param[i].quit = true; 3287 qemu_cond_signal(&decomp_param[i].cond); 3288 qemu_mutex_unlock(&decomp_param[i].mutex); 3289 } 3290 for (i = 0; i < thread_count; i++) { 3291 if (!decomp_param[i].compbuf) { 3292 break; 3293 } 3294 3295 qemu_thread_join(decompress_threads + i); 3296 qemu_mutex_destroy(&decomp_param[i].mutex); 3297 qemu_cond_destroy(&decomp_param[i].cond); 3298 inflateEnd(&decomp_param[i].stream); 3299 g_free(decomp_param[i].compbuf); 3300 decomp_param[i].compbuf = NULL; 3301 } 3302 g_free(decompress_threads); 3303 g_free(decomp_param); 3304 decompress_threads = NULL; 3305 decomp_param = NULL; 3306 decomp_file = NULL; 3307 } 3308 3309 static int compress_threads_load_setup(QEMUFile *f) 3310 { 3311 int i, thread_count; 3312 3313 if (!migrate_use_compression()) { 3314 return 0; 3315 } 3316 3317 thread_count = migrate_decompress_threads(); 3318 decompress_threads = g_new0(QemuThread, thread_count); 3319 decomp_param = g_new0(DecompressParam, thread_count); 3320 qemu_mutex_init(&decomp_done_lock); 3321 qemu_cond_init(&decomp_done_cond); 3322 decomp_file = f; 3323 for (i = 0; i < thread_count; i++) { 3324 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3325 goto exit; 3326 } 3327 3328 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3329 qemu_mutex_init(&decomp_param[i].mutex); 3330 qemu_cond_init(&decomp_param[i].cond); 3331 decomp_param[i].done = true; 3332 decomp_param[i].quit = false; 3333 qemu_thread_create(decompress_threads + i, "decompress", 3334 do_data_decompress, decomp_param + i, 3335 QEMU_THREAD_JOINABLE); 3336 } 3337 return 0; 3338 exit: 3339 compress_threads_load_cleanup(); 3340 return -1; 3341 } 3342 3343 static void decompress_data_with_multi_threads(QEMUFile *f, 3344 void *host, int len) 3345 { 3346 int idx, thread_count; 3347 3348 thread_count = migrate_decompress_threads(); 3349 QEMU_LOCK_GUARD(&decomp_done_lock); 3350 while (true) { 3351 for (idx = 0; idx < thread_count; idx++) { 3352 if (decomp_param[idx].done) { 3353 decomp_param[idx].done = false; 3354 qemu_mutex_lock(&decomp_param[idx].mutex); 3355 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3356 decomp_param[idx].des = host; 3357 decomp_param[idx].len = len; 3358 qemu_cond_signal(&decomp_param[idx].cond); 3359 qemu_mutex_unlock(&decomp_param[idx].mutex); 3360 break; 3361 } 3362 } 3363 if (idx < thread_count) { 3364 break; 3365 } else { 3366 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3367 } 3368 } 3369 } 3370 3371 static void colo_init_ram_state(void) 3372 { 3373 ram_state_init(&ram_state); 3374 } 3375 3376 /* 3377 * colo cache: this is for secondary VM, we cache the whole 3378 * memory of the secondary VM, it is need to hold the global lock 3379 * to call this helper. 
3380 */ 3381 int colo_init_ram_cache(void) 3382 { 3383 RAMBlock *block; 3384 3385 WITH_RCU_READ_LOCK_GUARD() { 3386 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3387 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3388 NULL, false, false); 3389 if (!block->colo_cache) { 3390 error_report("%s: Can't alloc memory for COLO cache of block %s," 3391 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3392 block->used_length); 3393 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3394 if (block->colo_cache) { 3395 qemu_anon_ram_free(block->colo_cache, block->used_length); 3396 block->colo_cache = NULL; 3397 } 3398 } 3399 return -errno; 3400 } 3401 } 3402 } 3403 3404 /* 3405 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3406 * with to decide which page in cache should be flushed into SVM's RAM. Here 3407 * we use the same name 'ram_bitmap' as for migration. 3408 */ 3409 if (ram_bytes_total()) { 3410 RAMBlock *block; 3411 3412 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3413 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3414 block->bmap = bitmap_new(pages); 3415 } 3416 } 3417 3418 colo_init_ram_state(); 3419 return 0; 3420 } 3421 3422 /* TODO: duplicated with ram_init_bitmaps */ 3423 void colo_incoming_start_dirty_log(void) 3424 { 3425 RAMBlock *block = NULL; 3426 /* For memory_global_dirty_log_start below. */ 3427 qemu_mutex_lock_iothread(); 3428 qemu_mutex_lock_ramlist(); 3429 3430 memory_global_dirty_log_sync(); 3431 WITH_RCU_READ_LOCK_GUARD() { 3432 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3433 ramblock_sync_dirty_bitmap(ram_state, block); 3434 /* Discard this dirty bitmap record */ 3435 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3436 } 3437 memory_global_dirty_log_start(); 3438 } 3439 ram_state->migration_dirty_pages = 0; 3440 qemu_mutex_unlock_ramlist(); 3441 qemu_mutex_unlock_iothread(); 3442 } 3443 3444 /* It is need to hold the global lock to call this helper */ 3445 void colo_release_ram_cache(void) 3446 { 3447 RAMBlock *block; 3448 3449 memory_global_dirty_log_stop(); 3450 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3451 g_free(block->bmap); 3452 block->bmap = NULL; 3453 } 3454 3455 WITH_RCU_READ_LOCK_GUARD() { 3456 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3457 if (block->colo_cache) { 3458 qemu_anon_ram_free(block->colo_cache, block->used_length); 3459 block->colo_cache = NULL; 3460 } 3461 } 3462 } 3463 ram_state_cleanup(&ram_state); 3464 } 3465 3466 /** 3467 * ram_load_setup: Setup RAM for migration incoming side 3468 * 3469 * Returns zero to indicate success and negative for error 3470 * 3471 * @f: QEMUFile where to receive the data 3472 * @opaque: RAMState pointer 3473 */ 3474 static int ram_load_setup(QEMUFile *f, void *opaque) 3475 { 3476 if (compress_threads_load_setup(f)) { 3477 return -1; 3478 } 3479 3480 xbzrle_load_setup(); 3481 ramblock_recv_map_init(); 3482 3483 return 0; 3484 } 3485 3486 static int ram_load_cleanup(void *opaque) 3487 { 3488 RAMBlock *rb; 3489 3490 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3491 qemu_ram_block_writeback(rb); 3492 } 3493 3494 xbzrle_load_cleanup(); 3495 compress_threads_load_cleanup(); 3496 3497 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3498 g_free(rb->receivedmap); 3499 rb->receivedmap = NULL; 3500 } 3501 3502 return 0; 3503 } 3504 3505 /** 3506 * ram_postcopy_incoming_init: allocate postcopy data structures 3507 * 3508 * Returns 0 for success and negative if there was one error 3509 * 3510 * @mis: current migration incoming state 3511 * 3512 * Allocate data structures etc needed by incoming migration with 3513 * 
postcopy-ram. postcopy-ram's similarly names 3514 * postcopy_ram_incoming_init does the work. 3515 */ 3516 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3517 { 3518 return postcopy_ram_incoming_init(mis); 3519 } 3520 3521 /** 3522 * ram_load_postcopy: load a page in postcopy case 3523 * 3524 * Returns 0 for success or -errno in case of error 3525 * 3526 * Called in postcopy mode by ram_load(). 3527 * rcu_read_lock is taken prior to this being called. 3528 * 3529 * @f: QEMUFile where to send the data 3530 */ 3531 static int ram_load_postcopy(QEMUFile *f) 3532 { 3533 int flags = 0, ret = 0; 3534 bool place_needed = false; 3535 bool matches_target_page_size = false; 3536 MigrationIncomingState *mis = migration_incoming_get_current(); 3537 /* Temporary page that is later 'placed' */ 3538 void *postcopy_host_page = mis->postcopy_tmp_page; 3539 void *host_page = NULL; 3540 bool all_zero = true; 3541 int target_pages = 0; 3542 3543 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3544 ram_addr_t addr; 3545 void *page_buffer = NULL; 3546 void *place_source = NULL; 3547 RAMBlock *block = NULL; 3548 uint8_t ch; 3549 int len; 3550 3551 addr = qemu_get_be64(f); 3552 3553 /* 3554 * If qemu file error, we should stop here, and then "addr" 3555 * may be invalid 3556 */ 3557 ret = qemu_file_get_error(f); 3558 if (ret) { 3559 break; 3560 } 3561 3562 flags = addr & ~TARGET_PAGE_MASK; 3563 addr &= TARGET_PAGE_MASK; 3564 3565 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3566 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3567 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3568 block = ram_block_from_stream(f, flags); 3569 if (!block) { 3570 ret = -EINVAL; 3571 break; 3572 } 3573 3574 /* 3575 * Relying on used_length is racy and can result in false positives. 3576 * We might place pages beyond used_length in case RAM was shrunk 3577 * while in postcopy, which is fine - trying to place via 3578 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3579 */ 3580 if (!block->host || addr >= block->postcopy_length) { 3581 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3582 ret = -EINVAL; 3583 break; 3584 } 3585 target_pages++; 3586 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3587 /* 3588 * Postcopy requires that we place whole host pages atomically; 3589 * these may be huge pages for RAMBlocks that are backed by 3590 * hugetlbfs. 3591 * To make it atomic, the data is read into a temporary page 3592 * that's moved into place later. 3593 * The migration protocol uses, possibly smaller, target-pages 3594 * however the source ensures it always sends all the components 3595 * of a host page in one chunk. 
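 *
 * For example (illustrative sizes): with a 2 MiB hugetlbfs host page and
 * 4 KiB target pages, block->page_size / TARGET_PAGE_SIZE == 512, so 512
 * incoming target pages are accumulated in the temporary page before
 * place_needed is set and the whole host page is placed in one go.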
3596 */ 3597 page_buffer = postcopy_host_page + 3598 host_page_offset_from_ram_block_offset(block, addr); 3599 /* If all TP are zero then we can optimise the place */ 3600 if (target_pages == 1) { 3601 host_page = host_page_from_ram_block_offset(block, addr); 3602 } else if (host_page != host_page_from_ram_block_offset(block, 3603 addr)) { 3604 /* not the 1st TP within the HP */ 3605 error_report("Non-same host page %p/%p", host_page, 3606 host_page_from_ram_block_offset(block, addr)); 3607 ret = -EINVAL; 3608 break; 3609 } 3610 3611 /* 3612 * If it's the last part of a host page then we place the host 3613 * page 3614 */ 3615 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3616 place_needed = true; 3617 } 3618 place_source = postcopy_host_page; 3619 } 3620 3621 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3622 case RAM_SAVE_FLAG_ZERO: 3623 ch = qemu_get_byte(f); 3624 /* 3625 * Can skip to set page_buffer when 3626 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3627 */ 3628 if (ch || !matches_target_page_size) { 3629 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3630 } 3631 if (ch) { 3632 all_zero = false; 3633 } 3634 break; 3635 3636 case RAM_SAVE_FLAG_PAGE: 3637 all_zero = false; 3638 if (!matches_target_page_size) { 3639 /* For huge pages, we always use temporary buffer */ 3640 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3641 } else { 3642 /* 3643 * For small pages that matches target page size, we 3644 * avoid the qemu_file copy. Instead we directly use 3645 * the buffer of QEMUFile to place the page. Note: we 3646 * cannot do any QEMUFile operation before using that 3647 * buffer to make sure the buffer is valid when 3648 * placing the page. 3649 */ 3650 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3651 TARGET_PAGE_SIZE); 3652 } 3653 break; 3654 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3655 all_zero = false; 3656 len = qemu_get_be32(f); 3657 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3658 error_report("Invalid compressed data length: %d", len); 3659 ret = -EINVAL; 3660 break; 3661 } 3662 decompress_data_with_multi_threads(f, page_buffer, len); 3663 break; 3664 3665 case RAM_SAVE_FLAG_EOS: 3666 /* normal exit */ 3667 multifd_recv_sync_main(); 3668 break; 3669 default: 3670 error_report("Unknown combination of migration flags: 0x%x" 3671 " (postcopy mode)", flags); 3672 ret = -EINVAL; 3673 break; 3674 } 3675 3676 /* Got the whole host page, wait for decompress before placing. */ 3677 if (place_needed) { 3678 ret |= wait_for_decompress_done(); 3679 } 3680 3681 /* Detect for any possible file errors */ 3682 if (!ret && qemu_file_get_error(f)) { 3683 ret = qemu_file_get_error(f); 3684 } 3685 3686 if (!ret && place_needed) { 3687 if (all_zero) { 3688 ret = postcopy_place_page_zero(mis, host_page, block); 3689 } else { 3690 ret = postcopy_place_page(mis, host_page, place_source, 3691 block); 3692 } 3693 place_needed = false; 3694 target_pages = 0; 3695 /* Assume we have a zero page until we detect something different */ 3696 all_zero = true; 3697 } 3698 } 3699 3700 return ret; 3701 } 3702 3703 static bool postcopy_is_advised(void) 3704 { 3705 PostcopyState ps = postcopy_state_get(); 3706 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3707 } 3708 3709 static bool postcopy_is_running(void) 3710 { 3711 PostcopyState ps = postcopy_state_get(); 3712 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3713 } 3714 3715 /* 3716 * Flush content of RAM cache into SVM's memory. 
* Only flush the pages that have been dirtied by the PVM or the SVM or both. 3718 */ 3719 void colo_flush_ram_cache(void) 3720 { 3721 RAMBlock *block = NULL; 3722 void *dst_host; 3723 void *src_host; 3724 unsigned long offset = 0; 3725 3726 memory_global_dirty_log_sync(); 3727 qemu_mutex_lock(&ram_state->bitmap_mutex); 3728 WITH_RCU_READ_LOCK_GUARD() { 3729 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3730 ramblock_sync_dirty_bitmap(ram_state, block); 3731 } 3732 } 3733 3734 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3735 WITH_RCU_READ_LOCK_GUARD() { 3736 block = QLIST_FIRST_RCU(&ram_list.blocks); 3737 3738 while (block) { 3739 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3740 3741 if (!offset_in_ramblock(block, 3742 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3743 offset = 0; 3744 block = QLIST_NEXT_RCU(block, next); 3745 } else { 3746 migration_bitmap_clear_dirty(ram_state, block, offset); 3747 dst_host = block->host 3748 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3749 src_host = block->colo_cache 3750 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3751 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3752 } 3753 } 3754 } 3755 trace_colo_flush_ram_cache_end(); 3756 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3757 } 3758 3759 /** 3760 * ram_load_precopy: load pages in precopy case 3761 * 3762 * Returns 0 for success or -errno in case of error 3763 * 3764 * Called in precopy mode by ram_load(). 3765 * rcu_read_lock is taken prior to this being called. 3766 * 3767 * @f: QEMUFile from which to receive the data 3768 */ 3769 static int ram_load_precopy(QEMUFile *f) 3770 { 3771 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3772 /* ADVISE is earlier; it shows that the source has the postcopy capability on */ 3773 bool postcopy_advised = postcopy_is_advised(); 3774 if (!migrate_use_compression()) { 3775 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3776 } 3777 3778 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3779 ram_addr_t addr, total_ram_bytes; 3780 void *host = NULL, *host_bak = NULL; 3781 uint8_t ch; 3782 3783 /* 3784 * Yield periodically to let the main loop run, but an iteration of 3785 * the main loop is expensive, so only do it every so many iterations 3786 */ 3787 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3788 aio_co_schedule(qemu_get_current_aio_context(), 3789 qemu_coroutine_self()); 3790 qemu_coroutine_yield(); 3791 } 3792 i++; 3793 3794 addr = qemu_get_be64(f); 3795 flags = addr & ~TARGET_PAGE_MASK; 3796 addr &= TARGET_PAGE_MASK; 3797 3798 if (flags & invalid_flags) { 3799 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3800 error_report("Received an unexpected compressed page"); 3801 } 3802 3803 ret = -EINVAL; 3804 break; 3805 } 3806 3807 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3808 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3809 RAMBlock *block = ram_block_from_stream(f, flags); 3810 3811 host = host_from_ram_block_offset(block, addr); 3812 /* 3813 * After going into the COLO stage, we should not load the page 3814 * into the SVM's memory directly; we put it into colo_cache first. 3815 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3816 * Previously, we copied all this memory in the COLO preparing stage 3817 * while the VM had to be stopped, which is a time-consuming process.
3818 * Here we optimize it by a trick, back-up every page while in 3819 * migration process while COLO is enabled, though it affects the 3820 * speed of the migration, but it obviously reduce the downtime of 3821 * back-up all SVM'S memory in COLO preparing stage. 3822 */ 3823 if (migration_incoming_colo_enabled()) { 3824 if (migration_incoming_in_colo_state()) { 3825 /* In COLO stage, put all pages into cache temporarily */ 3826 host = colo_cache_from_block_offset(block, addr, true); 3827 } else { 3828 /* 3829 * In migration stage but before COLO stage, 3830 * Put all pages into both cache and SVM's memory. 3831 */ 3832 host_bak = colo_cache_from_block_offset(block, addr, false); 3833 } 3834 } 3835 if (!host) { 3836 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3837 ret = -EINVAL; 3838 break; 3839 } 3840 if (!migration_incoming_in_colo_state()) { 3841 ramblock_recv_bitmap_set(block, host); 3842 } 3843 3844 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3845 } 3846 3847 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3848 case RAM_SAVE_FLAG_MEM_SIZE: 3849 /* Synchronize RAM block list */ 3850 total_ram_bytes = addr; 3851 while (!ret && total_ram_bytes) { 3852 RAMBlock *block; 3853 char id[256]; 3854 ram_addr_t length; 3855 3856 len = qemu_get_byte(f); 3857 qemu_get_buffer(f, (uint8_t *)id, len); 3858 id[len] = 0; 3859 length = qemu_get_be64(f); 3860 3861 block = qemu_ram_block_by_name(id); 3862 if (block && !qemu_ram_is_migratable(block)) { 3863 error_report("block %s should not be migrated !", id); 3864 ret = -EINVAL; 3865 } else if (block) { 3866 if (length != block->used_length) { 3867 Error *local_err = NULL; 3868 3869 ret = qemu_ram_resize(block, length, 3870 &local_err); 3871 if (local_err) { 3872 error_report_err(local_err); 3873 } 3874 } 3875 /* For postcopy we need to check hugepage sizes match */ 3876 if (postcopy_advised && migrate_postcopy_ram() && 3877 block->page_size != qemu_host_page_size) { 3878 uint64_t remote_page_size = qemu_get_be64(f); 3879 if (remote_page_size != block->page_size) { 3880 error_report("Mismatched RAM page size %s " 3881 "(local) %zd != %" PRId64, 3882 id, block->page_size, 3883 remote_page_size); 3884 ret = -EINVAL; 3885 } 3886 } 3887 if (migrate_ignore_shared()) { 3888 hwaddr addr = qemu_get_be64(f); 3889 if (ramblock_is_ignored(block) && 3890 block->mr->addr != addr) { 3891 error_report("Mismatched GPAs for block %s " 3892 "%" PRId64 "!= %" PRId64, 3893 id, (uint64_t)addr, 3894 (uint64_t)block->mr->addr); 3895 ret = -EINVAL; 3896 } 3897 } 3898 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3899 block->idstr); 3900 } else { 3901 error_report("Unknown ramblock \"%s\", cannot " 3902 "accept migration", id); 3903 ret = -EINVAL; 3904 } 3905 3906 total_ram_bytes -= length; 3907 } 3908 break; 3909 3910 case RAM_SAVE_FLAG_ZERO: 3911 ch = qemu_get_byte(f); 3912 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3913 break; 3914 3915 case RAM_SAVE_FLAG_PAGE: 3916 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3917 break; 3918 3919 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3920 len = qemu_get_be32(f); 3921 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3922 error_report("Invalid compressed data length: %d", len); 3923 ret = -EINVAL; 3924 break; 3925 } 3926 decompress_data_with_multi_threads(f, host, len); 3927 break; 3928 3929 case RAM_SAVE_FLAG_XBZRLE: 3930 if (load_xbzrle(f, addr, host) < 0) { 3931 error_report("Failed to decompress XBZRLE page at " 3932 RAM_ADDR_FMT, addr); 3933 ret = -EINVAL; 3934 break; 3935 } 3936 break; 3937 case 

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
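
/*
 * Wire format consumed by ram_dirty_bitmap_reload() below, as implied by
 * the reads it performs: a big-endian 64-bit byte count, the bitmap data
 * in little-endian long order padded up to a multiple of 8 bytes, and a
 * closing RAMBLOCK_RECV_BITMAP_ENDING marker.  A matching sender could
 * look like this sketch (hypothetical helper, for illustration only):
 *
 *     static void example_send_recv_bitmap(QEMUFile *f,
 *                                          const uint8_t *le_bitmap,
 *                                          uint64_t padded_size)
 *     {
 *         qemu_put_be64(f, padded_size);                // byte count
 *         qemu_put_buffer(f, le_bitmap, padded_size);   // LE bitmap data
 *         qemu_put_be64(f, RAMBLOCK_RECV_BITMAP_ENDING);
 *     }
 */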

/*
 * Read the received bitmap and invert it to become the initial dirty
 * bitmap.  This is only used when a postcopy migration is paused but
 * wants to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock's */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused), so the
     * dirty bitmap won't change and we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for the current ramblock.
     * If this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
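
/*
 * A tiny worked example of the inversion above, assuming a block with
 * four pages: if the destination reported pages 0, 1 and 3 as received,
 * bitmap_complement() leaves only bit 2 set, so the resumed postcopy
 * re-sends just page 2.
 *
 *     // received bitmap: bits {0, 1, 3} set
 *     // after complement: bit {2} set  ->  only page 2 is dirty
 */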

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM
         * blocks changing at random points in time; in particular, after
         * the RAM block sizes have been sent in the migration stream they
         * must no longer change.  Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migrate_set_error(migrate_get_current(), err);
        error_free(err);
        migration_cancel();
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at
         * the time postcopy was advised.  Syncing RAM blocks with the
         * source will result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
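
/*
 * The notifier registered in ram_mig_init() above uses the generic
 * RAMBlockNotifier mechanism.  A hypothetical subsystem that also wants
 * to observe RAM block resizes could hook in the same way (sketch only;
 * my_block_resized() and my_subsystem_init() are illustrative names):
 *
 *     static void my_block_resized(RAMBlockNotifier *n, void *host,
 *                                  size_t old_size, size_t new_size)
 *     {
 *         // react to the resize here
 *     }
 *
 *     static RAMBlockNotifier my_notifier = {
 *         .ram_block_resized = my_block_resized,
 *     };
 *
 *     void my_subsystem_init(void)
 *     {
 *         ram_block_notifier_add(&my_notifier);
 *     }
 */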