/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;

    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator of whether the thread is
         * properly initialized or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
                                                       RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
                                                 RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rs, rb, i);
    }
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rs, rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
            QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

/*
 * ram_block_populate_pages: populate memory in the RAM block by reading
 *   an integer from the beginning of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_pages(RAMBlock *block)
{
    char *ptr = (char *) block->host;

    for (ram_addr_t offset = 0; offset < block->used_length;
            offset += qemu_real_host_page_size) {
        char tmp = *(ptr + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_pages(block);
    }
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, block->host,
                block->max_length, true, false)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vCPUs blocked by write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it's really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 *   request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is enabled (e.g., after first round of migration), stop
     * using the data compression. In theory, xbzrle can do better than
     * compression.
     */
1926 */ 1927 if (rs->xbzrle_enabled) { 1928 return false; 1929 } 1930 1931 return true; 1932 } 1933 1934 /* 1935 * try to compress the page before posting it out, return true if the page 1936 * has been properly handled by compression, otherwise needs other 1937 * paths to handle it 1938 */ 1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1940 { 1941 if (!save_page_use_compression(rs)) { 1942 return false; 1943 } 1944 1945 /* 1946 * When starting the process of a new block, the first page of 1947 * the block should be sent out before other pages in the same 1948 * block, and all the pages in last block should have been sent 1949 * out, keeping this order is important, because the 'cont' flag 1950 * is used to avoid resending the block name. 1951 * 1952 * We post the fist page as normal page as compression will take 1953 * much CPU resource. 1954 */ 1955 if (block != rs->last_sent_block) { 1956 flush_compressed_data(rs); 1957 return false; 1958 } 1959 1960 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1961 return true; 1962 } 1963 1964 compression_counters.busy++; 1965 return false; 1966 } 1967 1968 /** 1969 * ram_save_target_page: save one target page 1970 * 1971 * Returns the number of pages written 1972 * 1973 * @rs: current RAM state 1974 * @pss: data about the page we want to send 1975 * @last_stage: if we are at the completion stage 1976 */ 1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1978 bool last_stage) 1979 { 1980 RAMBlock *block = pss->block; 1981 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1982 int res; 1983 1984 if (control_save_page(rs, block, offset, &res)) { 1985 return res; 1986 } 1987 1988 if (save_compress_page(rs, block, offset)) { 1989 return 1; 1990 } 1991 1992 res = save_zero_page(rs, block, offset); 1993 if (res > 0) { 1994 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1995 * page would be stale 1996 */ 1997 if (!save_page_use_compression(rs)) { 1998 XBZRLE_cache_lock(); 1999 xbzrle_cache_zero_page(rs, block->offset + offset); 2000 XBZRLE_cache_unlock(); 2001 } 2002 ram_release_pages(block->idstr, offset, res); 2003 return res; 2004 } 2005 2006 /* 2007 * Do not use multifd for: 2008 * 1. Compression as the first page in the new block should be posted out 2009 * before sending the compressed page 2010 * 2. In postcopy as one whole host page should be placed 2011 */ 2012 if (!save_page_use_compression(rs) && migrate_use_multifd() 2013 && !migration_in_postcopy()) { 2014 return ram_save_multifd_page(rs, block, offset); 2015 } 2016 2017 return ram_save_page(rs, pss, last_stage); 2018 } 2019 2020 /** 2021 * ram_save_host_page: save a whole host page 2022 * 2023 * Starting at *offset send pages up to the end of the current host 2024 * page. It's valid for the initial offset to point into the middle of 2025 * a host page in which case the remainder of the hostpage is sent. 2026 * Only dirty target pages are sent. Note that the host page size may 2027 * be a huge page for this block. 2028 * The saving stops at the boundary of the used_length of the block 2029 * if the RAMBlock isn't a multiple of the host page size. 
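 * Dirty target pages within the host page are located with
 * migration_bitmap_find_dirty() and sent one at a time through
 * ram_save_target_page().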
* 2031 * Returns the number of pages written or negative on error 2032 * 2033 * @rs: current RAM state 2035 * @pss: data about the page we want to send 2036 * @last_stage: if we are at the completion stage 2037 */ 2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 2039 bool last_stage) 2040 { 2041 int tmppages, pages = 0; 2042 size_t pagesize_bits = 2043 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2044 unsigned long hostpage_boundary = 2045 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2046 unsigned long start_page = pss->page; 2047 int res; 2048 2049 if (ramblock_is_ignored(pss->block)) { 2050 error_report("block %s should not be migrated !", pss->block->idstr); 2051 return 0; 2052 } 2053 2054 do { 2055 /* Check if the page is dirty and, if it is, send it */ 2056 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2057 tmppages = ram_save_target_page(rs, pss, last_stage); 2058 if (tmppages < 0) { 2059 return tmppages; 2060 } 2061 2062 pages += tmppages; 2063 /* 2064 * Allow rate limiting to happen in the middle of huge pages if 2065 * something is sent in the current iteration. 2066 */ 2067 if (pagesize_bits > 1 && tmppages > 0) { 2068 migration_rate_limit(); 2069 } 2070 } 2071 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2072 } while ((pss->page < hostpage_boundary) && 2073 offset_in_ramblock(pss->block, 2074 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2075 /* The offset we leave with is the min boundary of host page and block */ 2076 pss->page = MIN(pss->page, hostpage_boundary) - 1; 2077 2078 res = ram_save_release_protection(rs, pss, start_page); 2079 return (res < 0 ? res : pages); 2080 } 2081 2082 /** 2083 * ram_find_and_save_block: finds a dirty page and sends it to f 2084 * 2085 * Called within an RCU critical section. 2086 * 2087 * Returns the number of pages written where zero means no dirty pages, 2088 * or negative on error 2089 * 2090 * @rs: current RAM state 2091 * @last_stage: if we are at the completion stage 2092 * 2093 * On systems where host-page-size > target-page-size it will send all the 2094 * pages in a host page that are dirty.
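 *
 * Queued postcopy requests are serviced first via get_queued_page();
 * only when none are pending does the search fall back to scanning the
 * dirty bitmap with find_dirty_block().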
2095 */ 2096 2097 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2098 { 2099 PageSearchStatus pss; 2100 int pages = 0; 2101 bool again, found; 2102 2103 /* No dirty page as there is zero RAM */ 2104 if (!ram_bytes_total()) { 2105 return pages; 2106 } 2107 2108 pss.block = rs->last_seen_block; 2109 pss.page = rs->last_page; 2110 pss.complete_round = false; 2111 2112 if (!pss.block) { 2113 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2114 } 2115 2116 do { 2117 again = true; 2118 found = get_queued_page(rs, &pss); 2119 2120 if (!found) { 2121 /* priority queue empty, so just search for something dirty */ 2122 found = find_dirty_block(rs, &pss, &again); 2123 } 2124 2125 if (found) { 2126 pages = ram_save_host_page(rs, &pss, last_stage); 2127 } 2128 } while (!pages && again); 2129 2130 rs->last_seen_block = pss.block; 2131 rs->last_page = pss.page; 2132 2133 return pages; 2134 } 2135 2136 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2137 { 2138 uint64_t pages = size / TARGET_PAGE_SIZE; 2139 2140 if (zero) { 2141 ram_counters.duplicate += pages; 2142 } else { 2143 ram_counters.normal += pages; 2144 ram_counters.transferred += size; 2145 qemu_update_position(f, size); 2146 } 2147 } 2148 2149 static uint64_t ram_bytes_total_common(bool count_ignored) 2150 { 2151 RAMBlock *block; 2152 uint64_t total = 0; 2153 2154 RCU_READ_LOCK_GUARD(); 2155 2156 if (count_ignored) { 2157 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2158 total += block->used_length; 2159 } 2160 } else { 2161 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2162 total += block->used_length; 2163 } 2164 } 2165 return total; 2166 } 2167 2168 uint64_t ram_bytes_total(void) 2169 { 2170 return ram_bytes_total_common(false); 2171 } 2172 2173 static void xbzrle_load_setup(void) 2174 { 2175 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2176 } 2177 2178 static void xbzrle_load_cleanup(void) 2179 { 2180 g_free(XBZRLE.decoded_buf); 2181 XBZRLE.decoded_buf = NULL; 2182 } 2183 2184 static void ram_state_cleanup(RAMState **rsp) 2185 { 2186 if (*rsp) { 2187 migration_page_queue_free(*rsp); 2188 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2189 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2190 g_free(*rsp); 2191 *rsp = NULL; 2192 } 2193 } 2194 2195 static void xbzrle_cleanup(void) 2196 { 2197 XBZRLE_cache_lock(); 2198 if (XBZRLE.cache) { 2199 cache_fini(XBZRLE.cache); 2200 g_free(XBZRLE.encoded_buf); 2201 g_free(XBZRLE.current_buf); 2202 g_free(XBZRLE.zero_target_page); 2203 XBZRLE.cache = NULL; 2204 XBZRLE.encoded_buf = NULL; 2205 XBZRLE.current_buf = NULL; 2206 XBZRLE.zero_target_page = NULL; 2207 } 2208 XBZRLE_cache_unlock(); 2209 } 2210 2211 static void ram_save_cleanup(void *opaque) 2212 { 2213 RAMState **rsp = opaque; 2214 RAMBlock *block; 2215 2216 /* We don't use dirty log with background snapshots */ 2217 if (!migrate_background_snapshot()) { 2218 /* caller have hold iothread lock or is in a bh, so there is 2219 * no writing race against the migration bitmap 2220 */ 2221 memory_global_dirty_log_stop(); 2222 } 2223 2224 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2225 g_free(block->clear_bmap); 2226 block->clear_bmap = NULL; 2227 g_free(block->bmap); 2228 block->bmap = NULL; 2229 } 2230 2231 xbzrle_cleanup(); 2232 compress_threads_save_cleanup(); 2233 ram_state_cleanup(rsp); 2234 } 2235 2236 static void ram_state_reset(RAMState *rs) 2237 { 2238 rs->last_seen_block = NULL; 2239 rs->last_sent_block = NULL; 2240 rs->last_page = 0; 2241 rs->last_version = ram_list.version; 2242 rs->xbzrle_enabled = false; 2243 } 2244 2245 
#define MAX_WAIT 50 /* ms, half buffered_file limit */ 2246 2247 /* 2248 * 'expected' is the value you expect the bitmap mostly to be full 2249 * of; it won't bother printing lines that are all this value. 2250 * If 'todump' is null the migration bitmap is dumped. 2251 */ 2252 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2253 unsigned long pages) 2254 { 2255 int64_t cur; 2256 int64_t linelen = 128; 2257 char linebuf[129]; 2258 2259 for (cur = 0; cur < pages; cur += linelen) { 2260 int64_t curb; 2261 bool found = false; 2262 /* 2263 * Last line; catch the case where the line length 2264 * is longer than remaining ram 2265 */ 2266 if (cur + linelen > pages) { 2267 linelen = pages - cur; 2268 } 2269 for (curb = 0; curb < linelen; curb++) { 2270 bool thisbit = test_bit(cur + curb, todump); 2271 linebuf[curb] = thisbit ? '1' : '.'; 2272 found = found || (thisbit != expected); 2273 } 2274 if (found) { 2275 linebuf[curb] = '\0'; 2276 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2277 } 2278 } 2279 } 2280 2281 /* **** functions for postcopy ***** */ 2282 2283 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2284 { 2285 struct RAMBlock *block; 2286 2287 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2288 unsigned long *bitmap = block->bmap; 2289 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2290 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2291 2292 while (run_start < range) { 2293 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2294 ram_discard_range(block->idstr, 2295 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2296 ((ram_addr_t)(run_end - run_start)) 2297 << TARGET_PAGE_BITS); 2298 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2299 } 2300 } 2301 } 2302 2303 /** 2304 * postcopy_send_discard_bm_ram: discard a RAMBlock 2305 * 2306 * Returns zero on success 2307 * 2308 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2309 * 2310 * @ms: current migration state 2311 * @block: RAMBlock to discard 2312 */ 2313 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2314 { 2315 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2316 unsigned long current; 2317 unsigned long *bitmap = block->bmap; 2318 2319 for (current = 0; current < end; ) { 2320 unsigned long one = find_next_bit(bitmap, end, current); 2321 unsigned long zero, discard_length; 2322 2323 if (one >= end) { 2324 break; 2325 } 2326 2327 zero = find_next_zero_bit(bitmap, end, one + 1); 2328 2329 if (zero >= end) { 2330 discard_length = end - one; 2331 } else { 2332 discard_length = zero - one; 2333 } 2334 postcopy_discard_send_range(ms, one, discard_length); 2335 current = one + discard_length; 2336 } 2337 2338 return 0; 2339 } 2340 2341 /** 2342 * postcopy_each_ram_send_discard: discard all RAMBlocks 2343 * 2344 * Returns 0 for success or negative for error 2345 * 2346 * Utility for the outgoing postcopy code. 2347 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2348 * passing it bitmap indexes and name. 
2349 * (qemu_ram_foreach_block ends up passing unscaled lengths 2350 * which would mean postcopy code would have to deal with target page) 2351 * 2352 * @ms: current migration state 2353 */ 2354 static int postcopy_each_ram_send_discard(MigrationState *ms) 2355 { 2356 struct RAMBlock *block; 2357 int ret; 2358 2359 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2360 postcopy_discard_send_init(ms, block->idstr); 2361 2362 /* 2363 * Postcopy sends chunks of bitmap over the wire, but it 2364 * just needs indexes at this point, avoids it having 2365 * target page specific code. 2366 */ 2367 ret = postcopy_send_discard_bm_ram(ms, block); 2368 postcopy_discard_send_finish(ms); 2369 if (ret) { 2370 return ret; 2371 } 2372 } 2373 2374 return 0; 2375 } 2376 2377 /** 2378 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2379 * 2380 * Helper for postcopy_chunk_hostpages; it's called twice to 2381 * canonicalize the two bitmaps, that are similar, but one is 2382 * inverted. 2383 * 2384 * Postcopy requires that all target pages in a hostpage are dirty or 2385 * clean, not a mix. This function canonicalizes the bitmaps. 2386 * 2387 * @ms: current migration state 2388 * @block: block that contains the page we want to canonicalize 2389 */ 2390 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2391 { 2392 RAMState *rs = ram_state; 2393 unsigned long *bitmap = block->bmap; 2394 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2395 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2396 unsigned long run_start; 2397 2398 if (block->page_size == TARGET_PAGE_SIZE) { 2399 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2400 return; 2401 } 2402 2403 /* Find a dirty page */ 2404 run_start = find_next_bit(bitmap, pages, 0); 2405 2406 while (run_start < pages) { 2407 2408 /* 2409 * If the start of this run of pages is in the middle of a host 2410 * page, then we need to fixup this host page. 2411 */ 2412 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2413 /* Find the end of this run */ 2414 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2415 /* 2416 * If the end isn't at the start of a host page, then the 2417 * run doesn't finish at the end of a host page 2418 * and we need to discard. 2419 */ 2420 } 2421 2422 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2423 unsigned long page; 2424 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2425 host_ratio); 2426 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2427 2428 /* Clean up the bitmap */ 2429 for (page = fixup_start_addr; 2430 page < fixup_start_addr + host_ratio; page++) { 2431 /* 2432 * Remark them as dirty, updating the count for any pages 2433 * that weren't previously dirty. 2434 */ 2435 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2436 } 2437 } 2438 2439 /* Find the next dirty page for the next iteration */ 2440 run_start = find_next_bit(bitmap, pages, run_start); 2441 } 2442 } 2443 2444 /** 2445 * postcopy_chunk_hostpages: discard any partially sent host page 2446 * 2447 * Utility for the outgoing postcopy code. 2448 * 2449 * Discard any partially sent host-page size chunks, mark any partially 2450 * dirty host-page size chunks as all dirty. In this case the host-page 2451 * is the host-page for the particular RAMBlock, i.e. 
it might be a huge page 2452 * 2453 * Returns zero on success 2454 * 2455 * @ms: current migration state 2456 * @block: block we want to work with 2457 */ 2458 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2459 { 2460 postcopy_discard_send_init(ms, block->idstr); 2461 2462 /* 2463 * Ensure that all partially dirty host pages are made fully dirty. 2464 */ 2465 postcopy_chunk_hostpages_pass(ms, block); 2466 2467 postcopy_discard_send_finish(ms); 2468 return 0; 2469 } 2470 2471 /** 2472 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2473 * 2474 * Returns zero on success 2475 * 2476 * Transmit the set of pages to be discarded after precopy to the target 2477 * these are pages that: 2478 * a) Have been previously transmitted but are now dirty again 2479 * b) Pages that have never been transmitted, this ensures that 2480 * any pages on the destination that have been mapped by background 2481 * tasks get discarded (transparent huge pages is the specific concern) 2482 * Hopefully this is pretty sparse 2483 * 2484 * @ms: current migration state 2485 */ 2486 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2487 { 2488 RAMState *rs = ram_state; 2489 RAMBlock *block; 2490 int ret; 2491 2492 RCU_READ_LOCK_GUARD(); 2493 2494 /* This should be our last sync, the src is now paused */ 2495 migration_bitmap_sync(rs); 2496 2497 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2498 rs->last_seen_block = NULL; 2499 rs->last_sent_block = NULL; 2500 rs->last_page = 0; 2501 2502 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2503 /* Deal with TPS != HPS and huge pages */ 2504 ret = postcopy_chunk_hostpages(ms, block); 2505 if (ret) { 2506 return ret; 2507 } 2508 2509 #ifdef DEBUG_POSTCOPY 2510 ram_debug_dump_bitmap(block->bmap, true, 2511 block->used_length >> TARGET_PAGE_BITS); 2512 #endif 2513 } 2514 trace_ram_postcopy_send_discard_bitmap(); 2515 2516 return postcopy_each_ram_send_discard(ms); 2517 } 2518 2519 /** 2520 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2521 * 2522 * Returns zero on success 2523 * 2524 * @rbname: name of the RAMBlock of the request. NULL means the 2525 * same that last one. 2526 * @start: RAMBlock starting page 2527 * @length: RAMBlock size 2528 */ 2529 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2530 { 2531 trace_ram_discard_range(rbname, start, length); 2532 2533 RCU_READ_LOCK_GUARD(); 2534 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2535 2536 if (!rb) { 2537 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2538 return -1; 2539 } 2540 2541 /* 2542 * On source VM, we don't need to update the received bitmap since 2543 * we don't even have one. 2544 */ 2545 if (rb->receivedmap) { 2546 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2547 length >> qemu_target_page_bits()); 2548 } 2549 2550 return ram_block_discard_range(rb, start, length); 2551 } 2552 2553 /* 2554 * For every allocation, we will try not to crash the VM if the 2555 * allocation failed. 
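 * That is why the g_try_* allocators are used below: on failure we
 * report the error and unwind instead of aborting.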
2556 */ 2557 static int xbzrle_init(void) 2558 { 2559 Error *local_err = NULL; 2560 2561 if (!migrate_use_xbzrle()) { 2562 return 0; 2563 } 2564 2565 XBZRLE_cache_lock(); 2566 2567 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2568 if (!XBZRLE.zero_target_page) { 2569 error_report("%s: Error allocating zero page", __func__); 2570 goto err_out; 2571 } 2572 2573 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2574 TARGET_PAGE_SIZE, &local_err); 2575 if (!XBZRLE.cache) { 2576 error_report_err(local_err); 2577 goto free_zero_page; 2578 } 2579 2580 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2581 if (!XBZRLE.encoded_buf) { 2582 error_report("%s: Error allocating encoded_buf", __func__); 2583 goto free_cache; 2584 } 2585 2586 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2587 if (!XBZRLE.current_buf) { 2588 error_report("%s: Error allocating current_buf", __func__); 2589 goto free_encoded_buf; 2590 } 2591 2592 /* We are all good */ 2593 XBZRLE_cache_unlock(); 2594 return 0; 2595 2596 free_encoded_buf: 2597 g_free(XBZRLE.encoded_buf); 2598 XBZRLE.encoded_buf = NULL; 2599 free_cache: 2600 cache_fini(XBZRLE.cache); 2601 XBZRLE.cache = NULL; 2602 free_zero_page: 2603 g_free(XBZRLE.zero_target_page); 2604 XBZRLE.zero_target_page = NULL; 2605 err_out: 2606 XBZRLE_cache_unlock(); 2607 return -ENOMEM; 2608 } 2609 2610 static int ram_state_init(RAMState **rsp) 2611 { 2612 *rsp = g_try_new0(RAMState, 1); 2613 2614 if (!*rsp) { 2615 error_report("%s: Init ramstate fail", __func__); 2616 return -1; 2617 } 2618 2619 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2620 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2621 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2622 2623 /* 2624 * Count the total number of pages used by ram blocks not including any 2625 * gaps due to alignment or unplugs. 2626 * This must match with the initial values of dirty bitmap. 2627 */ 2628 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2629 ram_state_reset(*rsp); 2630 2631 return 0; 2632 } 2633 2634 static void ram_list_init_bitmaps(void) 2635 { 2636 MigrationState *ms = migrate_get_current(); 2637 RAMBlock *block; 2638 unsigned long pages; 2639 uint8_t shift; 2640 2641 /* Skip setting bitmap if there is no RAM */ 2642 if (ram_bytes_total()) { 2643 shift = ms->clear_bitmap_shift; 2644 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2645 error_report("clear_bitmap_shift (%u) too big, using " 2646 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2647 shift = CLEAR_BITMAP_SHIFT_MAX; 2648 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2649 error_report("clear_bitmap_shift (%u) too small, using " 2650 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2651 shift = CLEAR_BITMAP_SHIFT_MIN; 2652 } 2653 2654 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2655 pages = block->max_length >> TARGET_PAGE_BITS; 2656 /* 2657 * The initial dirty bitmap for migration must be set with all 2658 * ones to make sure we'll migrate every guest RAM page to 2659 * destination. 2660 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2661 * new migration after a failed migration, ram_list. 2662 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2663 * guest memory. 2664 */ 2665 block->bmap = bitmap_new(pages); 2666 bitmap_set(block->bmap, 0, pages); 2667 block->clear_bmap_shift = shift; 2668 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2669 } 2670 } 2671 } 2672 2673 static void ram_init_bitmaps(RAMState *rs) 2674 { 2675 /* For memory_global_dirty_log_start below. 
*/ 2676 qemu_mutex_lock_iothread(); 2677 qemu_mutex_lock_ramlist(); 2678 2679 WITH_RCU_READ_LOCK_GUARD() { 2680 ram_list_init_bitmaps(); 2681 /* We don't use dirty log with background snapshots */ 2682 if (!migrate_background_snapshot()) { 2683 memory_global_dirty_log_start(); 2684 migration_bitmap_sync_precopy(rs); 2685 } 2686 } 2687 qemu_mutex_unlock_ramlist(); 2688 qemu_mutex_unlock_iothread(); 2689 } 2690 2691 static int ram_init_all(RAMState **rsp) 2692 { 2693 if (ram_state_init(rsp)) { 2694 return -1; 2695 } 2696 2697 if (xbzrle_init()) { 2698 ram_state_cleanup(rsp); 2699 return -1; 2700 } 2701 2702 ram_init_bitmaps(*rsp); 2703 2704 return 0; 2705 } 2706 2707 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2708 { 2709 RAMBlock *block; 2710 uint64_t pages = 0; 2711 2712 /* 2713 * Postcopy is not using xbzrle/compression, so no need for that. 2714 * Also, since source are already halted, we don't need to care 2715 * about dirty page logging as well. 2716 */ 2717 2718 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2719 pages += bitmap_count_one(block->bmap, 2720 block->used_length >> TARGET_PAGE_BITS); 2721 } 2722 2723 /* This may not be aligned with current bitmaps. Recalculate. */ 2724 rs->migration_dirty_pages = pages; 2725 2726 ram_state_reset(rs); 2727 2728 /* Update RAMState cache of output QEMUFile */ 2729 rs->f = out; 2730 2731 trace_ram_state_resume_prepare(pages); 2732 } 2733 2734 /* 2735 * This function clears bits of the free pages reported by the caller from the 2736 * migration dirty bitmap. @addr is the host address corresponding to the 2737 * start of the continuous guest free pages, and @len is the total bytes of 2738 * those pages. 2739 */ 2740 void qemu_guest_free_page_hint(void *addr, size_t len) 2741 { 2742 RAMBlock *block; 2743 ram_addr_t offset; 2744 size_t used_len, start, npages; 2745 MigrationState *s = migrate_get_current(); 2746 2747 /* This function is currently expected to be used during live migration */ 2748 if (!migration_is_setup_or_active(s->state)) { 2749 return; 2750 } 2751 2752 for (; len > 0; len -= used_len, addr += used_len) { 2753 block = qemu_ram_block_from_host(addr, false, &offset); 2754 if (unlikely(!block || offset >= block->used_length)) { 2755 /* 2756 * The implementation might not support RAMBlock resize during 2757 * live migration, but it could happen in theory with future 2758 * updates. So we add a check here to capture that case. 2759 */ 2760 error_report_once("%s unexpected error", __func__); 2761 return; 2762 } 2763 2764 if (len <= block->used_length - offset) { 2765 used_len = len; 2766 } else { 2767 used_len = block->used_length - offset; 2768 } 2769 2770 start = offset >> TARGET_PAGE_BITS; 2771 npages = used_len >> TARGET_PAGE_BITS; 2772 2773 qemu_mutex_lock(&ram_state->bitmap_mutex); 2774 /* 2775 * The skipped free pages are equavalent to be sent from clear_bmap's 2776 * perspective, so clear the bits from the memory region bitmap which 2777 * are initially set. Otherwise those skipped pages will be sent in 2778 * the next round after syncing from the memory region bitmap. 2779 */ 2780 migration_clear_memory_region_dirty_bitmap_range(ram_state, block, 2781 start, npages); 2782 ram_state->migration_dirty_pages -= 2783 bitmap_count_one_with_offset(block->bmap, start, npages); 2784 bitmap_clear(block->bmap, start, npages); 2785 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2786 } 2787 } 2788 2789 /* 2790 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2791 * long-running RCU critical section. 
When rcu-reclaims in the code 2792 * start to become numerous it will be necessary to reduce the 2793 * granularity of these critical sections. 2794 */ 2795 2796 /** 2797 * ram_save_setup: Setup RAM for migration 2798 * 2799 * Returns zero to indicate success and negative for error 2800 * 2801 * @f: QEMUFile where to send the data 2802 * @opaque: RAMState pointer 2803 */ 2804 static int ram_save_setup(QEMUFile *f, void *opaque) 2805 { 2806 RAMState **rsp = opaque; 2807 RAMBlock *block; 2808 2809 if (compress_threads_save_setup()) { 2810 return -1; 2811 } 2812 2813 /* migration has already setup the bitmap, reuse it. */ 2814 if (!migration_in_colo_state()) { 2815 if (ram_init_all(rsp) != 0) { 2816 compress_threads_save_cleanup(); 2817 return -1; 2818 } 2819 } 2820 (*rsp)->f = f; 2821 2822 WITH_RCU_READ_LOCK_GUARD() { 2823 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2824 2825 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2826 qemu_put_byte(f, strlen(block->idstr)); 2827 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2828 qemu_put_be64(f, block->used_length); 2829 if (migrate_postcopy_ram() && block->page_size != 2830 qemu_host_page_size) { 2831 qemu_put_be64(f, block->page_size); 2832 } 2833 if (migrate_ignore_shared()) { 2834 qemu_put_be64(f, block->mr->addr); 2835 } 2836 } 2837 } 2838 2839 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2840 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2841 2842 multifd_send_sync_main(f); 2843 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2844 qemu_fflush(f); 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * ram_save_iterate: iterative stage for migration 2851 * 2852 * Returns zero to indicate success and negative for error 2853 * 2854 * @f: QEMUFile where to send the data 2855 * @opaque: RAMState pointer 2856 */ 2857 static int ram_save_iterate(QEMUFile *f, void *opaque) 2858 { 2859 RAMState **temp = opaque; 2860 RAMState *rs = *temp; 2861 int ret = 0; 2862 int i; 2863 int64_t t0; 2864 int done = 0; 2865 2866 if (blk_mig_bulk_active()) { 2867 /* Avoid transferring ram during bulk phase of block migration as 2868 * the bulk phase will usually take a long time and transferring 2869 * ram updates during that time is pointless. */ 2870 goto out; 2871 } 2872 2873 /* 2874 * We'll take this lock a little bit long, but it's okay for two reasons. 2875 * Firstly, the only possible other thread to take it is who calls 2876 * qemu_guest_free_page_hint(), which should be rare; secondly, see 2877 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 2878 * guarantees that we'll at least released it in a regular basis. 
2879 */ 2880 qemu_mutex_lock(&rs->bitmap_mutex); 2881 WITH_RCU_READ_LOCK_GUARD() { 2882 if (ram_list.version != rs->last_version) { 2883 ram_state_reset(rs); 2884 } 2885 2886 /* Read version before ram_list.blocks */ 2887 smp_rmb(); 2888 2889 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2890 2891 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2892 i = 0; 2893 while ((ret = qemu_file_rate_limit(f)) == 0 || 2894 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2895 int pages; 2896 2897 if (qemu_file_get_error(f)) { 2898 break; 2899 } 2900 2901 pages = ram_find_and_save_block(rs, false); 2902 /* no more pages to sent */ 2903 if (pages == 0) { 2904 done = 1; 2905 break; 2906 } 2907 2908 if (pages < 0) { 2909 qemu_file_set_error(f, pages); 2910 break; 2911 } 2912 2913 rs->target_page_count += pages; 2914 2915 /* 2916 * During postcopy, it is necessary to make sure one whole host 2917 * page is sent in one chunk. 2918 */ 2919 if (migrate_postcopy_ram()) { 2920 flush_compressed_data(rs); 2921 } 2922 2923 /* 2924 * we want to check in the 1st loop, just in case it was the 1st 2925 * time and we had to sync the dirty bitmap. 2926 * qemu_clock_get_ns() is a bit expensive, so we only check each 2927 * some iterations 2928 */ 2929 if ((i & 63) == 0) { 2930 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2931 1000000; 2932 if (t1 > MAX_WAIT) { 2933 trace_ram_save_iterate_big_wait(t1, i); 2934 break; 2935 } 2936 } 2937 i++; 2938 } 2939 } 2940 qemu_mutex_unlock(&rs->bitmap_mutex); 2941 2942 /* 2943 * Must occur before EOS (or any QEMUFile operation) 2944 * because of RDMA protocol. 2945 */ 2946 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2947 2948 out: 2949 if (ret >= 0 2950 && migration_is_setup_or_active(migrate_get_current()->state)) { 2951 multifd_send_sync_main(rs->f); 2952 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2953 qemu_fflush(f); 2954 ram_counters.transferred += 8; 2955 2956 ret = qemu_file_get_error(f); 2957 } 2958 if (ret < 0) { 2959 return ret; 2960 } 2961 2962 return done; 2963 } 2964 2965 /** 2966 * ram_save_complete: function called to send the remaining amount of ram 2967 * 2968 * Returns zero to indicate success or negative on error 2969 * 2970 * Called with iothread lock 2971 * 2972 * @f: QEMUFile where to send the data 2973 * @opaque: RAMState pointer 2974 */ 2975 static int ram_save_complete(QEMUFile *f, void *opaque) 2976 { 2977 RAMState **temp = opaque; 2978 RAMState *rs = *temp; 2979 int ret = 0; 2980 2981 WITH_RCU_READ_LOCK_GUARD() { 2982 if (!migration_in_postcopy()) { 2983 migration_bitmap_sync_precopy(rs); 2984 } 2985 2986 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2987 2988 /* try transferring iterative blocks of memory */ 2989 2990 /* flush all remaining blocks regardless of rate limiting */ 2991 while (true) { 2992 int pages; 2993 2994 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2995 /* no more blocks to sent */ 2996 if (pages == 0) { 2997 break; 2998 } 2999 if (pages < 0) { 3000 ret = pages; 3001 break; 3002 } 3003 } 3004 3005 flush_compressed_data(rs); 3006 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3007 } 3008 3009 if (ret >= 0) { 3010 multifd_send_sync_main(rs->f); 3011 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3012 qemu_fflush(f); 3013 } 3014 3015 return ret; 3016 } 3017 3018 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3019 uint64_t *res_precopy_only, 3020 uint64_t *res_compatible, 3021 uint64_t *res_postcopy_only) 3022 { 3023 RAMState **temp = opaque; 3024 RAMState *rs = *temp; 3025 uint64_t 
remaining_size; 3026 3027 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3028 3029 if (!migration_in_postcopy() && 3030 remaining_size < max_size) { 3031 qemu_mutex_lock_iothread(); 3032 WITH_RCU_READ_LOCK_GUARD() { 3033 migration_bitmap_sync_precopy(rs); 3034 } 3035 qemu_mutex_unlock_iothread(); 3036 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3037 } 3038 3039 if (migrate_postcopy_ram()) { 3040 /* We can do postcopy, and all the data is postcopiable */ 3041 *res_compatible += remaining_size; 3042 } else { 3043 *res_precopy_only += remaining_size; 3044 } 3045 } 3046 3047 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3048 { 3049 unsigned int xh_len; 3050 int xh_flags; 3051 uint8_t *loaded_data; 3052 3053 /* extract RLE header */ 3054 xh_flags = qemu_get_byte(f); 3055 xh_len = qemu_get_be16(f); 3056 3057 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3058 error_report("Failed to load XBZRLE page - wrong compression!"); 3059 return -1; 3060 } 3061 3062 if (xh_len > TARGET_PAGE_SIZE) { 3063 error_report("Failed to load XBZRLE page - len overflow!"); 3064 return -1; 3065 } 3066 loaded_data = XBZRLE.decoded_buf; 3067 /* load data and decode */ 3068 /* it can change loaded_data to point to an internal buffer */ 3069 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3070 3071 /* decode RLE */ 3072 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3073 TARGET_PAGE_SIZE) == -1) { 3074 error_report("Failed to load XBZRLE page - decode error!"); 3075 return -1; 3076 } 3077 3078 return 0; 3079 } 3080 3081 /** 3082 * ram_block_from_stream: read a RAMBlock id from the migration stream 3083 * 3084 * Must be called from within a rcu critical section. 3085 * 3086 * Returns a pointer from within the RCU-protected ram_list. 3087 * 3088 * @f: QEMUFile where to read the data from 3089 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3090 */ 3091 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3092 { 3093 static RAMBlock *block; 3094 char id[256]; 3095 uint8_t len; 3096 3097 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3098 if (!block) { 3099 error_report("Ack, bad migration stream!"); 3100 return NULL; 3101 } 3102 return block; 3103 } 3104 3105 len = qemu_get_byte(f); 3106 qemu_get_buffer(f, (uint8_t *)id, len); 3107 id[len] = 0; 3108 3109 block = qemu_ram_block_by_name(id); 3110 if (!block) { 3111 error_report("Can't find block %s", id); 3112 return NULL; 3113 } 3114 3115 if (ramblock_is_ignored(block)) { 3116 error_report("block %s should not be migrated !", id); 3117 return NULL; 3118 } 3119 3120 return block; 3121 } 3122 3123 static inline void *host_from_ram_block_offset(RAMBlock *block, 3124 ram_addr_t offset) 3125 { 3126 if (!offset_in_ramblock(block, offset)) { 3127 return NULL; 3128 } 3129 3130 return block->host + offset; 3131 } 3132 3133 static void *host_page_from_ram_block_offset(RAMBlock *block, 3134 ram_addr_t offset) 3135 { 3136 /* Note: Explicitly no check against offset_in_ramblock(). 
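 * The caller is expected to have validated the offset already; here we
 * only align the host address down to the start of its host page.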
*/ 3137 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3138 block->page_size); 3139 } 3140 3141 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3142 ram_addr_t offset) 3143 { 3144 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3145 } 3146 3147 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3148 ram_addr_t offset, bool record_bitmap) 3149 { 3150 if (!offset_in_ramblock(block, offset)) { 3151 return NULL; 3152 } 3153 if (!block->colo_cache) { 3154 error_report("%s: colo_cache is NULL in block :%s", 3155 __func__, block->idstr); 3156 return NULL; 3157 } 3158 3159 /* 3160 * During colo checkpoint, we need bitmap of these migrated pages. 3161 * It help us to decide which pages in ram cache should be flushed 3162 * into VM's RAM later. 3163 */ 3164 if (record_bitmap && 3165 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3166 ram_state->migration_dirty_pages++; 3167 } 3168 return block->colo_cache + offset; 3169 } 3170 3171 /** 3172 * ram_handle_compressed: handle the zero page case 3173 * 3174 * If a page (or a whole RDMA chunk) has been 3175 * determined to be zero, then zap it. 3176 * 3177 * @host: host address for the zero page 3178 * @ch: what the page is filled from. We only support zero 3179 * @size: size of the zero page 3180 */ 3181 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3182 { 3183 if (ch != 0 || !is_zero_range(host, size)) { 3184 memset(host, ch, size); 3185 } 3186 } 3187 3188 /* return the size after decompression, or negative value on error */ 3189 static int 3190 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3191 const uint8_t *source, size_t source_len) 3192 { 3193 int err; 3194 3195 err = inflateReset(stream); 3196 if (err != Z_OK) { 3197 return -1; 3198 } 3199 3200 stream->avail_in = source_len; 3201 stream->next_in = (uint8_t *)source; 3202 stream->avail_out = dest_len; 3203 stream->next_out = dest; 3204 3205 err = inflate(stream, Z_NO_FLUSH); 3206 if (err != Z_STREAM_END) { 3207 return -1; 3208 } 3209 3210 return stream->total_out; 3211 } 3212 3213 static void *do_data_decompress(void *opaque) 3214 { 3215 DecompressParam *param = opaque; 3216 unsigned long pagesize; 3217 uint8_t *des; 3218 int len, ret; 3219 3220 qemu_mutex_lock(¶m->mutex); 3221 while (!param->quit) { 3222 if (param->des) { 3223 des = param->des; 3224 len = param->len; 3225 param->des = 0; 3226 qemu_mutex_unlock(¶m->mutex); 3227 3228 pagesize = TARGET_PAGE_SIZE; 3229 3230 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3231 param->compbuf, len); 3232 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3233 error_report("decompress data failed"); 3234 qemu_file_set_error(decomp_file, ret); 3235 } 3236 3237 qemu_mutex_lock(&decomp_done_lock); 3238 param->done = true; 3239 qemu_cond_signal(&decomp_done_cond); 3240 qemu_mutex_unlock(&decomp_done_lock); 3241 3242 qemu_mutex_lock(¶m->mutex); 3243 } else { 3244 qemu_cond_wait(¶m->cond, ¶m->mutex); 3245 } 3246 } 3247 qemu_mutex_unlock(¶m->mutex); 3248 3249 return NULL; 3250 } 3251 3252 static int wait_for_decompress_done(void) 3253 { 3254 int idx, thread_count; 3255 3256 if (!migrate_use_compression()) { 3257 return 0; 3258 } 3259 3260 thread_count = migrate_decompress_threads(); 3261 qemu_mutex_lock(&decomp_done_lock); 3262 for (idx = 0; idx < thread_count; idx++) { 3263 while (!decomp_param[idx].done) { 3264 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3265 } 3266 } 3267 
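    /* Every thread is idle now; return any error latched on decomp_file. */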
qemu_mutex_unlock(&decomp_done_lock); 3268 return qemu_file_get_error(decomp_file); 3269 } 3270 3271 static void compress_threads_load_cleanup(void) 3272 { 3273 int i, thread_count; 3274 3275 if (!migrate_use_compression()) { 3276 return; 3277 } 3278 thread_count = migrate_decompress_threads(); 3279 for (i = 0; i < thread_count; i++) { 3280 /* 3281 * we use it as a indicator which shows if the thread is 3282 * properly init'd or not 3283 */ 3284 if (!decomp_param[i].compbuf) { 3285 break; 3286 } 3287 3288 qemu_mutex_lock(&decomp_param[i].mutex); 3289 decomp_param[i].quit = true; 3290 qemu_cond_signal(&decomp_param[i].cond); 3291 qemu_mutex_unlock(&decomp_param[i].mutex); 3292 } 3293 for (i = 0; i < thread_count; i++) { 3294 if (!decomp_param[i].compbuf) { 3295 break; 3296 } 3297 3298 qemu_thread_join(decompress_threads + i); 3299 qemu_mutex_destroy(&decomp_param[i].mutex); 3300 qemu_cond_destroy(&decomp_param[i].cond); 3301 inflateEnd(&decomp_param[i].stream); 3302 g_free(decomp_param[i].compbuf); 3303 decomp_param[i].compbuf = NULL; 3304 } 3305 g_free(decompress_threads); 3306 g_free(decomp_param); 3307 decompress_threads = NULL; 3308 decomp_param = NULL; 3309 decomp_file = NULL; 3310 } 3311 3312 static int compress_threads_load_setup(QEMUFile *f) 3313 { 3314 int i, thread_count; 3315 3316 if (!migrate_use_compression()) { 3317 return 0; 3318 } 3319 3320 thread_count = migrate_decompress_threads(); 3321 decompress_threads = g_new0(QemuThread, thread_count); 3322 decomp_param = g_new0(DecompressParam, thread_count); 3323 qemu_mutex_init(&decomp_done_lock); 3324 qemu_cond_init(&decomp_done_cond); 3325 decomp_file = f; 3326 for (i = 0; i < thread_count; i++) { 3327 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3328 goto exit; 3329 } 3330 3331 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3332 qemu_mutex_init(&decomp_param[i].mutex); 3333 qemu_cond_init(&decomp_param[i].cond); 3334 decomp_param[i].done = true; 3335 decomp_param[i].quit = false; 3336 qemu_thread_create(decompress_threads + i, "decompress", 3337 do_data_decompress, decomp_param + i, 3338 QEMU_THREAD_JOINABLE); 3339 } 3340 return 0; 3341 exit: 3342 compress_threads_load_cleanup(); 3343 return -1; 3344 } 3345 3346 static void decompress_data_with_multi_threads(QEMUFile *f, 3347 void *host, int len) 3348 { 3349 int idx, thread_count; 3350 3351 thread_count = migrate_decompress_threads(); 3352 QEMU_LOCK_GUARD(&decomp_done_lock); 3353 while (true) { 3354 for (idx = 0; idx < thread_count; idx++) { 3355 if (decomp_param[idx].done) { 3356 decomp_param[idx].done = false; 3357 qemu_mutex_lock(&decomp_param[idx].mutex); 3358 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3359 decomp_param[idx].des = host; 3360 decomp_param[idx].len = len; 3361 qemu_cond_signal(&decomp_param[idx].cond); 3362 qemu_mutex_unlock(&decomp_param[idx].mutex); 3363 break; 3364 } 3365 } 3366 if (idx < thread_count) { 3367 break; 3368 } else { 3369 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3370 } 3371 } 3372 } 3373 3374 static void colo_init_ram_state(void) 3375 { 3376 ram_state_init(&ram_state); 3377 } 3378 3379 /* 3380 * colo cache: this is for secondary VM, we cache the whole 3381 * memory of the secondary VM, it is need to hold the global lock 3382 * to call this helper. 
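 * On allocation failure every colo_cache region allocated so far is
 * freed again and a negative errno value is returned.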
3383 */ 3384 int colo_init_ram_cache(void) 3385 { 3386 RAMBlock *block; 3387 3388 WITH_RCU_READ_LOCK_GUARD() { 3389 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3390 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3391 NULL, false, false); 3392 if (!block->colo_cache) { 3393 error_report("%s: Can't alloc memory for COLO cache of block %s," 3394 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3395 block->used_length); 3396 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3397 if (block->colo_cache) { 3398 qemu_anon_ram_free(block->colo_cache, block->used_length); 3399 block->colo_cache = NULL; 3400 } 3401 } 3402 return -errno; 3403 } 3404 } 3405 } 3406 3407 /* 3408 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3409 * with to decide which page in cache should be flushed into SVM's RAM. Here 3410 * we use the same name 'ram_bitmap' as for migration. 3411 */ 3412 if (ram_bytes_total()) { 3413 RAMBlock *block; 3414 3415 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3416 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3417 block->bmap = bitmap_new(pages); 3418 } 3419 } 3420 3421 colo_init_ram_state(); 3422 return 0; 3423 } 3424 3425 /* TODO: duplicated with ram_init_bitmaps */ 3426 void colo_incoming_start_dirty_log(void) 3427 { 3428 RAMBlock *block = NULL; 3429 /* For memory_global_dirty_log_start below. */ 3430 qemu_mutex_lock_iothread(); 3431 qemu_mutex_lock_ramlist(); 3432 3433 memory_global_dirty_log_sync(); 3434 WITH_RCU_READ_LOCK_GUARD() { 3435 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3436 ramblock_sync_dirty_bitmap(ram_state, block); 3437 /* Discard this dirty bitmap record */ 3438 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3439 } 3440 memory_global_dirty_log_start(); 3441 } 3442 ram_state->migration_dirty_pages = 0; 3443 qemu_mutex_unlock_ramlist(); 3444 qemu_mutex_unlock_iothread(); 3445 } 3446 3447 /* It is need to hold the global lock to call this helper */ 3448 void colo_release_ram_cache(void) 3449 { 3450 RAMBlock *block; 3451 3452 memory_global_dirty_log_stop(); 3453 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3454 g_free(block->bmap); 3455 block->bmap = NULL; 3456 } 3457 3458 WITH_RCU_READ_LOCK_GUARD() { 3459 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3460 if (block->colo_cache) { 3461 qemu_anon_ram_free(block->colo_cache, block->used_length); 3462 block->colo_cache = NULL; 3463 } 3464 } 3465 } 3466 ram_state_cleanup(&ram_state); 3467 } 3468 3469 /** 3470 * ram_load_setup: Setup RAM for migration incoming side 3471 * 3472 * Returns zero to indicate success and negative for error 3473 * 3474 * @f: QEMUFile where to receive the data 3475 * @opaque: RAMState pointer 3476 */ 3477 static int ram_load_setup(QEMUFile *f, void *opaque) 3478 { 3479 if (compress_threads_load_setup(f)) { 3480 return -1; 3481 } 3482 3483 xbzrle_load_setup(); 3484 ramblock_recv_map_init(); 3485 3486 return 0; 3487 } 3488 3489 static int ram_load_cleanup(void *opaque) 3490 { 3491 RAMBlock *rb; 3492 3493 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3494 qemu_ram_block_writeback(rb); 3495 } 3496 3497 xbzrle_load_cleanup(); 3498 compress_threads_load_cleanup(); 3499 3500 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3501 g_free(rb->receivedmap); 3502 rb->receivedmap = NULL; 3503 } 3504 3505 return 0; 3506 } 3507 3508 /** 3509 * ram_postcopy_incoming_init: allocate postcopy data structures 3510 * 3511 * Returns 0 for success and negative if there was one error 3512 * 3513 * @mis: current migration incoming state 3514 * 3515 * Allocate data structures etc needed by incoming migration with 3516 * 
postcopy-ram. postcopy-ram's similarly named 3517 * postcopy_ram_incoming_init does the work. 3518 */ 3519 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3520 { 3521 return postcopy_ram_incoming_init(mis); 3522 } 3523 3524 /** 3525 * ram_load_postcopy: load a page in postcopy case 3526 * 3527 * Returns 0 for success or -errno in case of error 3528 * 3529 * Called in postcopy mode by ram_load(). 3530 * rcu_read_lock is taken prior to this being called. 3531 * 3532 * @f: QEMUFile where to read the data from 3533 */ 3534 static int ram_load_postcopy(QEMUFile *f) 3535 { 3536 int flags = 0, ret = 0; 3537 bool place_needed = false; 3538 bool matches_target_page_size = false; 3539 MigrationIncomingState *mis = migration_incoming_get_current(); 3540 /* Temporary page that is later 'placed' */ 3541 void *postcopy_host_page = mis->postcopy_tmp_page; 3542 void *host_page = NULL; 3543 bool all_zero = true; 3544 int target_pages = 0; 3545 3546 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3547 ram_addr_t addr; 3548 void *page_buffer = NULL; 3549 void *place_source = NULL; 3550 RAMBlock *block = NULL; 3551 uint8_t ch; 3552 int len; 3553 3554 addr = qemu_get_be64(f); 3555 3556 /* 3557 * If there is a QEMUFile error, we should stop here, and then "addr" 3558 * may be invalid 3559 */ 3560 ret = qemu_file_get_error(f); 3561 if (ret) { 3562 break; 3563 } 3564 3565 flags = addr & ~TARGET_PAGE_MASK; 3566 addr &= TARGET_PAGE_MASK; 3567 3568 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3569 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3570 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3571 block = ram_block_from_stream(f, flags); 3572 if (!block) { 3573 ret = -EINVAL; 3574 break; 3575 } 3576 3577 /* 3578 * Relying on used_length is racy and can result in false positives. 3579 * We might place pages beyond used_length in case RAM was shrunk 3580 * while in postcopy, which is fine - trying to place via 3581 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3582 */ 3583 if (!block->host || addr >= block->postcopy_length) { 3584 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3585 ret = -EINVAL; 3586 break; 3587 } 3588 target_pages++; 3589 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3590 /* 3591 * Postcopy requires that we place whole host pages atomically; 3592 * these may be huge pages for RAMBlocks that are backed by 3593 * hugetlbfs. 3594 * To make it atomic, the data is read into a temporary page 3595 * that's moved into place later. 3596 * The migration protocol uses, possibly smaller, target-pages 3597 * however the source ensures it always sends all the components 3598 * of a host page in one chunk.
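 * Once the last target page of a host page has been read in,
 * place_needed is set and the whole host page is placed further down.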
3599 */ 3600 page_buffer = postcopy_host_page + 3601 host_page_offset_from_ram_block_offset(block, addr); 3602 /* If all TP are zero then we can optimise the place */ 3603 if (target_pages == 1) { 3604 host_page = host_page_from_ram_block_offset(block, addr); 3605 } else if (host_page != host_page_from_ram_block_offset(block, 3606 addr)) { 3607 /* not the 1st TP within the HP */ 3608 error_report("Non-same host page %p/%p", host_page, 3609 host_page_from_ram_block_offset(block, addr)); 3610 ret = -EINVAL; 3611 break; 3612 } 3613 3614 /* 3615 * If it's the last part of a host page then we place the host 3616 * page 3617 */ 3618 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3619 place_needed = true; 3620 } 3621 place_source = postcopy_host_page; 3622 } 3623 3624 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3625 case RAM_SAVE_FLAG_ZERO: 3626 ch = qemu_get_byte(f); 3627 /* 3628 * Can skip to set page_buffer when 3629 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3630 */ 3631 if (ch || !matches_target_page_size) { 3632 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3633 } 3634 if (ch) { 3635 all_zero = false; 3636 } 3637 break; 3638 3639 case RAM_SAVE_FLAG_PAGE: 3640 all_zero = false; 3641 if (!matches_target_page_size) { 3642 /* For huge pages, we always use temporary buffer */ 3643 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3644 } else { 3645 /* 3646 * For small pages that matches target page size, we 3647 * avoid the qemu_file copy. Instead we directly use 3648 * the buffer of QEMUFile to place the page. Note: we 3649 * cannot do any QEMUFile operation before using that 3650 * buffer to make sure the buffer is valid when 3651 * placing the page. 3652 */ 3653 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3654 TARGET_PAGE_SIZE); 3655 } 3656 break; 3657 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3658 all_zero = false; 3659 len = qemu_get_be32(f); 3660 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3661 error_report("Invalid compressed data length: %d", len); 3662 ret = -EINVAL; 3663 break; 3664 } 3665 decompress_data_with_multi_threads(f, page_buffer, len); 3666 break; 3667 3668 case RAM_SAVE_FLAG_EOS: 3669 /* normal exit */ 3670 multifd_recv_sync_main(); 3671 break; 3672 default: 3673 error_report("Unknown combination of migration flags: 0x%x" 3674 " (postcopy mode)", flags); 3675 ret = -EINVAL; 3676 break; 3677 } 3678 3679 /* Got the whole host page, wait for decompress before placing. */ 3680 if (place_needed) { 3681 ret |= wait_for_decompress_done(); 3682 } 3683 3684 /* Detect for any possible file errors */ 3685 if (!ret && qemu_file_get_error(f)) { 3686 ret = qemu_file_get_error(f); 3687 } 3688 3689 if (!ret && place_needed) { 3690 if (all_zero) { 3691 ret = postcopy_place_page_zero(mis, host_page, block); 3692 } else { 3693 ret = postcopy_place_page(mis, host_page, place_source, 3694 block); 3695 } 3696 place_needed = false; 3697 target_pages = 0; 3698 /* Assume we have a zero page until we detect something different */ 3699 all_zero = true; 3700 } 3701 } 3702 3703 return ret; 3704 } 3705 3706 static bool postcopy_is_advised(void) 3707 { 3708 PostcopyState ps = postcopy_state_get(); 3709 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3710 } 3711 3712 static bool postcopy_is_running(void) 3713 { 3714 PostcopyState ps = postcopy_state_get(); 3715 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3716 } 3717 3718 /* 3719 * Flush content of RAM cache into SVM's memory. 
3720 * Only flush the pages that be dirtied by PVM or SVM or both. 3721 */ 3722 void colo_flush_ram_cache(void) 3723 { 3724 RAMBlock *block = NULL; 3725 void *dst_host; 3726 void *src_host; 3727 unsigned long offset = 0; 3728 3729 memory_global_dirty_log_sync(); 3730 qemu_mutex_lock(&ram_state->bitmap_mutex); 3731 WITH_RCU_READ_LOCK_GUARD() { 3732 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3733 ramblock_sync_dirty_bitmap(ram_state, block); 3734 } 3735 } 3736 3737 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3738 WITH_RCU_READ_LOCK_GUARD() { 3739 block = QLIST_FIRST_RCU(&ram_list.blocks); 3740 3741 while (block) { 3742 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3743 3744 if (!offset_in_ramblock(block, 3745 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3746 offset = 0; 3747 block = QLIST_NEXT_RCU(block, next); 3748 } else { 3749 migration_bitmap_clear_dirty(ram_state, block, offset); 3750 dst_host = block->host 3751 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3752 src_host = block->colo_cache 3753 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3754 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3755 } 3756 } 3757 } 3758 trace_colo_flush_ram_cache_end(); 3759 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3760 } 3761 3762 /** 3763 * ram_load_precopy: load pages in precopy case 3764 * 3765 * Returns 0 for success or -errno in case of error 3766 * 3767 * Called in precopy mode by ram_load(). 3768 * rcu_read_lock is taken prior to this being called. 3769 * 3770 * @f: QEMUFile where to send the data 3771 */ 3772 static int ram_load_precopy(QEMUFile *f) 3773 { 3774 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3775 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3776 bool postcopy_advised = postcopy_is_advised(); 3777 if (!migrate_use_compression()) { 3778 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3779 } 3780 3781 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3782 ram_addr_t addr, total_ram_bytes; 3783 void *host = NULL, *host_bak = NULL; 3784 uint8_t ch; 3785 3786 /* 3787 * Yield periodically to let main loop run, but an iteration of 3788 * the main loop is expensive, so do it each some iterations 3789 */ 3790 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3791 aio_co_schedule(qemu_get_current_aio_context(), 3792 qemu_coroutine_self()); 3793 qemu_coroutine_yield(); 3794 } 3795 i++; 3796 3797 addr = qemu_get_be64(f); 3798 flags = addr & ~TARGET_PAGE_MASK; 3799 addr &= TARGET_PAGE_MASK; 3800 3801 if (flags & invalid_flags) { 3802 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3803 error_report("Received an unexpected compressed page"); 3804 } 3805 3806 ret = -EINVAL; 3807 break; 3808 } 3809 3810 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3811 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3812 RAMBlock *block = ram_block_from_stream(f, flags); 3813 3814 host = host_from_ram_block_offset(block, addr); 3815 /* 3816 * After going into COLO stage, we should not load the page 3817 * into SVM's memory directly, we put them into colo_cache firstly. 3818 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3819 * Previously, we copied all these memory in preparing stage of COLO 3820 * while we need to stop VM, which is a time-consuming process. 
3821 * Here we optimize it by a trick, back-up every page while in 3822 * migration process while COLO is enabled, though it affects the 3823 * speed of the migration, but it obviously reduce the downtime of 3824 * back-up all SVM'S memory in COLO preparing stage. 3825 */ 3826 if (migration_incoming_colo_enabled()) { 3827 if (migration_incoming_in_colo_state()) { 3828 /* In COLO stage, put all pages into cache temporarily */ 3829 host = colo_cache_from_block_offset(block, addr, true); 3830 } else { 3831 /* 3832 * In migration stage but before COLO stage, 3833 * Put all pages into both cache and SVM's memory. 3834 */ 3835 host_bak = colo_cache_from_block_offset(block, addr, false); 3836 } 3837 } 3838 if (!host) { 3839 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3840 ret = -EINVAL; 3841 break; 3842 } 3843 if (!migration_incoming_in_colo_state()) { 3844 ramblock_recv_bitmap_set(block, host); 3845 } 3846 3847 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3848 } 3849 3850 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3851 case RAM_SAVE_FLAG_MEM_SIZE: 3852 /* Synchronize RAM block list */ 3853 total_ram_bytes = addr; 3854 while (!ret && total_ram_bytes) { 3855 RAMBlock *block; 3856 char id[256]; 3857 ram_addr_t length; 3858 3859 len = qemu_get_byte(f); 3860 qemu_get_buffer(f, (uint8_t *)id, len); 3861 id[len] = 0; 3862 length = qemu_get_be64(f); 3863 3864 block = qemu_ram_block_by_name(id); 3865 if (block && !qemu_ram_is_migratable(block)) { 3866 error_report("block %s should not be migrated !", id); 3867 ret = -EINVAL; 3868 } else if (block) { 3869 if (length != block->used_length) { 3870 Error *local_err = NULL; 3871 3872 ret = qemu_ram_resize(block, length, 3873 &local_err); 3874 if (local_err) { 3875 error_report_err(local_err); 3876 } 3877 } 3878 /* For postcopy we need to check hugepage sizes match */ 3879 if (postcopy_advised && migrate_postcopy_ram() && 3880 block->page_size != qemu_host_page_size) { 3881 uint64_t remote_page_size = qemu_get_be64(f); 3882 if (remote_page_size != block->page_size) { 3883 error_report("Mismatched RAM page size %s " 3884 "(local) %zd != %" PRId64, 3885 id, block->page_size, 3886 remote_page_size); 3887 ret = -EINVAL; 3888 } 3889 } 3890 if (migrate_ignore_shared()) { 3891 hwaddr addr = qemu_get_be64(f); 3892 if (ramblock_is_ignored(block) && 3893 block->mr->addr != addr) { 3894 error_report("Mismatched GPAs for block %s " 3895 "%" PRId64 "!= %" PRId64, 3896 id, (uint64_t)addr, 3897 (uint64_t)block->mr->addr); 3898 ret = -EINVAL; 3899 } 3900 } 3901 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3902 block->idstr); 3903 } else { 3904 error_report("Unknown ramblock \"%s\", cannot " 3905 "accept migration", id); 3906 ret = -EINVAL; 3907 } 3908 3909 total_ram_bytes -= length; 3910 } 3911 break; 3912 3913 case RAM_SAVE_FLAG_ZERO: 3914 ch = qemu_get_byte(f); 3915 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3916 break; 3917 3918 case RAM_SAVE_FLAG_PAGE: 3919 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3920 break; 3921 3922 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3923 len = qemu_get_be32(f); 3924 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3925 error_report("Invalid compressed data length: %d", len); 3926 ret = -EINVAL; 3927 break; 3928 } 3929 decompress_data_with_multi_threads(f, host, len); 3930 break; 3931 3932 case RAM_SAVE_FLAG_XBZRLE: 3933 if (load_xbzrle(f, addr, host) < 0) { 3934 error_report("Failed to decompress XBZRLE page at " 3935 RAM_ADDR_FMT, addr); 3936 ret = -EINVAL; 3937 break; 3938 } 3939 break; 3940 case 
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclamations in this code become numerous, it will be
     * necessary to reduce the granularity of this critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is NVDIMM memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps have been synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it to use as the initial dirty bitmap.
 * This is only used when a paused postcopy migration is resumed from a
 * middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see the comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused), so the
     * dirty bitmap won't change and we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Invert it to obtain the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time; in particular, after the RAM
         * block sizes have been sent in the migration stream, they must no
         * longer change. Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migrate_set_error(migrate_get_current(), err);
        error_free(err);
        migration_cancel();
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
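
/*
 * Illustrative sketch, not part of the original file and kept under "#if 0"
 * so it is never compiled: how the framing consumed by
 * ram_dirty_bitmap_reload() above could be produced by a sender. It assumes
 * exactly the wire layout the loader checks for: a big-endian 64-bit size,
 * the little-endian bitmap padded up to a multiple of 8 bytes, and the
 * RAMBLOCK_RECV_BITMAP_ENDING marker. The real sender in this file is
 * ramblock_recv_bitmap_send(); this sketch may differ from it in detail,
 * and the helper name below is hypothetical.
 */
#if 0
static int example_send_recv_bitmap(QEMUFile *file, RAMBlock *block)
{
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    /* Bitmap size in bytes, rounded up to 8 bytes as the loader expects */
    uint64_t size = ROUND_UP(DIV_ROUND_UP(nbits, 8), 8);

    /* One extra long of padding so the rounded-up size is always covered */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    qemu_put_be64(file, size);                         /* bitmap size */
    qemu_put_buffer(file, (uint8_t *)le_bitmap, size); /* padded LE bitmap */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);  /* end mark */
    qemu_fflush(file);

    g_free(le_bitmap);
    return qemu_file_get_error(file);
}
#endif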