/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
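/*
 * On the wire, every page record starts with a be64 word carrying the page
 * offset within its RAMBlock, with one of the flags above OR'ed into the
 * low bits (see save_page_header() below).  Unless RAM_SAVE_FLAG_CONTINUE
 * is set, the block's idstr follows (length byte + string).  The payload
 * after that depends on the flag: a single zero byte for ZERO, a raw page
 * for PAGE, an XBZRLE-encoded delta for XBZRLE, or zlib data for
 * COMPRESS_PAGE.
 */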
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap, so that the
     * stream works even when source and destination VMs do not use
     * the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* count of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;
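/*
 * ram_transferred_add: account for @bytes sent on the migration stream
 *
 * The bytes are attributed to the phase-specific counter (precopy bytes
 * while the guest is running, postcopy bytes while in postcopy, downtime
 * bytes otherwise) as well as to the overall transferred counter.
 */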
static void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        ram_counters.postcopy_bytes += bytes;
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    ram_counters.transferred += bytes;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /* Whether current page is explicitly requested by postcopy */
    bool         postcopy_requested;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}
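/*
 * migration_update_rates: refresh the per-period rate counters
 *
 * Called at the end of a bitmap sync period: recomputes the dirty pages
 * rate and, when the corresponding features are enabled, the xbzrle
 * cache-miss and encoding rates and the compression busy rate and
 * compression ratio for the period that just ended.
 */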
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

static void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_transferred_add(save_page_header(rs, rs->f, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        return true;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

/*
 * compress_page_with_multi_thread: hand the page at @block/@offset to an
 * idle compression thread.
 *
 * Returns 1 if a thread took the page, or -1 if none was free and
 * 'compress-wait-thread' is off, in which case the caller posts the page
 * out as a normal (uncompressed) page.
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    /* This is not a postcopy requested page */
    pss->postcopy_requested = false;

    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;
    size_t page_size;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;
    page_size = qemu_ram_pagesize(block);
    /* Each page request should only be a multiple of the ramblock's page size */
    assert((entry->len % page_size) == 0);

    if (entry->len > page_size) {
        entry->len -= page_size;
        entry->offset += page_size;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    trace_unqueue_page(block->idstr, *offset,
                       test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
*/ 1649 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1650 false, false); 1651 } 1652 1653 return res; 1654 } 1655 1656 /* ram_write_tracking_available: check if kernel supports required UFFD features 1657 * 1658 * Returns true if supports, false otherwise 1659 */ 1660 bool ram_write_tracking_available(void) 1661 { 1662 uint64_t uffd_features; 1663 int res; 1664 1665 res = uffd_query_features(&uffd_features); 1666 return (res == 0 && 1667 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1668 } 1669 1670 /* ram_write_tracking_compatible: check if guest configuration is 1671 * compatible with 'write-tracking' 1672 * 1673 * Returns true if compatible, false otherwise 1674 */ 1675 bool ram_write_tracking_compatible(void) 1676 { 1677 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1678 int uffd_fd; 1679 RAMBlock *block; 1680 bool ret = false; 1681 1682 /* Open UFFD file descriptor */ 1683 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1684 if (uffd_fd < 0) { 1685 return false; 1686 } 1687 1688 RCU_READ_LOCK_GUARD(); 1689 1690 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1691 uint64_t uffd_ioctls; 1692 1693 /* Nothing to do with read-only and MMIO-writable regions */ 1694 if (block->mr->readonly || block->mr->rom_device) { 1695 continue; 1696 } 1697 /* Try to register block memory via UFFD-IO to track writes */ 1698 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1699 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1700 goto out; 1701 } 1702 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1703 goto out; 1704 } 1705 } 1706 ret = true; 1707 1708 out: 1709 uffd_close_fd(uffd_fd); 1710 return ret; 1711 } 1712 1713 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1714 ram_addr_t size) 1715 { 1716 /* 1717 * We read one byte of each page; this will preallocate page tables if 1718 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1719 * where no page was populated yet. This might require adaption when 1720 * supporting other mappings, like shmem. 1721 */ 1722 for (; offset < size; offset += block->page_size) { 1723 char tmp = *((char *)block->host + offset); 1724 1725 /* Don't optimize the read out */ 1726 asm volatile("" : "+r" (tmp)); 1727 } 1728 } 1729 1730 static inline int populate_read_section(MemoryRegionSection *section, 1731 void *opaque) 1732 { 1733 const hwaddr size = int128_get64(section->size); 1734 hwaddr offset = section->offset_within_region; 1735 RAMBlock *block = section->mr->ram_block; 1736 1737 populate_read_range(block, offset, size); 1738 return 0; 1739 } 1740 1741 /* 1742 * ram_block_populate_read: preallocate page tables and populate pages in the 1743 * RAM block by reading a byte of each page. 1744 * 1745 * Since it's solely used for userfault_fd WP feature, here we just 1746 * hardcode page size to qemu_real_host_page_size. 1747 * 1748 * @block: RAM block to populate 1749 */ 1750 static void ram_block_populate_read(RAMBlock *rb) 1751 { 1752 /* 1753 * Skip populating all pages that fall into a discarded range as managed by 1754 * a RamDiscardManager responsible for the mapped memory region of the 1755 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1756 * must not get populated automatically. We don't have to track 1757 * modifications via userfaultfd WP reliably, because these pages will 1758 * not be part of the migration stream either way -- see 1759 * ramblock_dirty_bitmap_exclude_discarded_pages(). 
1760 * 1761 * Note: The result is only stable while migrating (precopy/postcopy). 1762 */ 1763 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1764 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1765 MemoryRegionSection section = { 1766 .mr = rb->mr, 1767 .offset_within_region = 0, 1768 .size = rb->mr->size, 1769 }; 1770 1771 ram_discard_manager_replay_populated(rdm, §ion, 1772 populate_read_section, NULL); 1773 } else { 1774 populate_read_range(rb, 0, rb->used_length); 1775 } 1776 } 1777 1778 /* 1779 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1780 */ 1781 void ram_write_tracking_prepare(void) 1782 { 1783 RAMBlock *block; 1784 1785 RCU_READ_LOCK_GUARD(); 1786 1787 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1788 /* Nothing to do with read-only and MMIO-writable regions */ 1789 if (block->mr->readonly || block->mr->rom_device) { 1790 continue; 1791 } 1792 1793 /* 1794 * Populate pages of the RAM block before enabling userfault_fd 1795 * write protection. 1796 * 1797 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1798 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1799 * pages with pte_none() entries in page table. 1800 */ 1801 ram_block_populate_read(block); 1802 } 1803 } 1804 1805 /* 1806 * ram_write_tracking_start: start UFFD-WP memory tracking 1807 * 1808 * Returns 0 for success or negative value in case of error 1809 */ 1810 int ram_write_tracking_start(void) 1811 { 1812 int uffd_fd; 1813 RAMState *rs = ram_state; 1814 RAMBlock *block; 1815 1816 /* Open UFFD file descriptor */ 1817 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1818 if (uffd_fd < 0) { 1819 return uffd_fd; 1820 } 1821 rs->uffdio_fd = uffd_fd; 1822 1823 RCU_READ_LOCK_GUARD(); 1824 1825 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1826 /* Nothing to do with read-only and MMIO-writable regions */ 1827 if (block->mr->readonly || block->mr->rom_device) { 1828 continue; 1829 } 1830 1831 /* Register block memory with UFFD to track writes */ 1832 if (uffd_register_memory(rs->uffdio_fd, block->host, 1833 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1834 goto fail; 1835 } 1836 /* Apply UFFD write protection to the block memory range */ 1837 if (uffd_change_protection(rs->uffdio_fd, block->host, 1838 block->max_length, true, false)) { 1839 goto fail; 1840 } 1841 block->flags |= RAM_UF_WRITEPROTECT; 1842 memory_region_ref(block->mr); 1843 1844 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1845 block->host, block->max_length); 1846 } 1847 1848 return 0; 1849 1850 fail: 1851 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1852 1853 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1854 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1855 continue; 1856 } 1857 /* 1858 * In case some memory block failed to be write-protected 1859 * remove protection and unregister all succeeded RAM blocks 1860 */ 1861 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1862 false, false); 1863 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1864 /* Cleanup flags and remove reference */ 1865 block->flags &= ~RAM_UF_WRITEPROTECT; 1866 memory_region_unref(block->mr); 1867 } 1868 1869 uffd_close_fd(uffd_fd); 1870 rs->uffdio_fd = -1; 1871 return -1; 1872 } 1873 1874 /** 1875 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1876 */ 1877 void ram_write_tracking_stop(void) 1878 { 1879 RAMState *rs = ram_state; 1880 RAMBlock *block; 1881 
1882 RCU_READ_LOCK_GUARD(); 1883 1884 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1885 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1886 continue; 1887 } 1888 /* Remove protection and unregister all affected RAM blocks */ 1889 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1890 false, false); 1891 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1892 1893 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1894 block->host, block->max_length); 1895 1896 /* Cleanup flags and remove reference */ 1897 block->flags &= ~RAM_UF_WRITEPROTECT; 1898 memory_region_unref(block->mr); 1899 } 1900 1901 /* Finally close UFFD file descriptor */ 1902 uffd_close_fd(rs->uffdio_fd); 1903 rs->uffdio_fd = -1; 1904 } 1905 1906 #else 1907 /* No target OS support, stubs just fail or ignore */ 1908 1909 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1910 { 1911 (void) rs; 1912 (void) offset; 1913 1914 return NULL; 1915 } 1916 1917 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1918 unsigned long start_page) 1919 { 1920 (void) rs; 1921 (void) pss; 1922 (void) start_page; 1923 1924 return 0; 1925 } 1926 1927 bool ram_write_tracking_available(void) 1928 { 1929 return false; 1930 } 1931 1932 bool ram_write_tracking_compatible(void) 1933 { 1934 assert(0); 1935 return false; 1936 } 1937 1938 int ram_write_tracking_start(void) 1939 { 1940 assert(0); 1941 return -1; 1942 } 1943 1944 void ram_write_tracking_stop(void) 1945 { 1946 assert(0); 1947 } 1948 #endif /* defined(__linux__) */ 1949 1950 /** 1951 * get_queued_page: unqueue a page from the postcopy requests 1952 * 1953 * Skips pages that are already sent (!dirty) 1954 * 1955 * Returns true if a queued page is found 1956 * 1957 * @rs: current RAM state 1958 * @pss: data about the state of the current dirty page scan 1959 */ 1960 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1961 { 1962 RAMBlock *block; 1963 ram_addr_t offset; 1964 1965 block = unqueue_page(rs, &offset); 1966 1967 if (!block) { 1968 /* 1969 * Poll write faults too if background snapshot is enabled; that's 1970 * when we have vcpus got blocked by the write protected pages. 1971 */ 1972 block = poll_fault_page(rs, &offset); 1973 } 1974 1975 if (block) { 1976 /* 1977 * We want the background search to continue from the queued page 1978 * since the guest is likely to want other pages near to the page 1979 * it just requested. 1980 */ 1981 pss->block = block; 1982 pss->page = offset >> TARGET_PAGE_BITS; 1983 1984 /* 1985 * This unqueued page would break the "one round" check, even is 1986 * really rare. 1987 */ 1988 pss->complete_round = false; 1989 pss->postcopy_requested = true; 1990 } 1991 1992 return !!block; 1993 } 1994 1995 /** 1996 * migration_page_queue_free: drop any remaining pages in the ram 1997 * request queue 1998 * 1999 * It should be empty at the end anyway, but in error cases there may 2000 * be some left. in case that there is any page left, we drop it. 2001 * 2002 */ 2003 static void migration_page_queue_free(RAMState *rs) 2004 { 2005 struct RAMSrcPageRequest *mspr, *next_mspr; 2006 /* This queue generally should be empty - but in the case of a failed 2007 * migration might have some droppings in. 
2008 */ 2009 RCU_READ_LOCK_GUARD(); 2010 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2011 memory_region_unref(mspr->rb->mr); 2012 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2013 g_free(mspr); 2014 } 2015 } 2016 2017 /** 2018 * ram_save_queue_pages: queue the page for transmission 2019 * 2020 * A request from postcopy destination for example. 2021 * 2022 * Returns zero on success or negative on error 2023 * 2024 * @rbname: Name of the RAMBLock of the request. NULL means the 2025 * same that last one. 2026 * @start: starting address from the start of the RAMBlock 2027 * @len: length (in bytes) to send 2028 */ 2029 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2030 { 2031 RAMBlock *ramblock; 2032 RAMState *rs = ram_state; 2033 2034 ram_counters.postcopy_requests++; 2035 RCU_READ_LOCK_GUARD(); 2036 2037 if (!rbname) { 2038 /* Reuse last RAMBlock */ 2039 ramblock = rs->last_req_rb; 2040 2041 if (!ramblock) { 2042 /* 2043 * Shouldn't happen, we can't reuse the last RAMBlock if 2044 * it's the 1st request. 2045 */ 2046 error_report("ram_save_queue_pages no previous block"); 2047 return -1; 2048 } 2049 } else { 2050 ramblock = qemu_ram_block_by_name(rbname); 2051 2052 if (!ramblock) { 2053 /* We shouldn't be asked for a non-existent RAMBlock */ 2054 error_report("ram_save_queue_pages no block '%s'", rbname); 2055 return -1; 2056 } 2057 rs->last_req_rb = ramblock; 2058 } 2059 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2060 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2061 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2062 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2063 __func__, start, len, ramblock->used_length); 2064 return -1; 2065 } 2066 2067 struct RAMSrcPageRequest *new_entry = 2068 g_new0(struct RAMSrcPageRequest, 1); 2069 new_entry->rb = ramblock; 2070 new_entry->offset = start; 2071 new_entry->len = len; 2072 2073 memory_region_ref(ramblock->mr); 2074 qemu_mutex_lock(&rs->src_page_req_mutex); 2075 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2076 migration_make_urgent_request(); 2077 qemu_mutex_unlock(&rs->src_page_req_mutex); 2078 2079 return 0; 2080 } 2081 2082 static bool save_page_use_compression(RAMState *rs) 2083 { 2084 if (!migrate_use_compression()) { 2085 return false; 2086 } 2087 2088 /* 2089 * If xbzrle is enabled (e.g., after first round of migration), stop 2090 * using the data compression. In theory, xbzrle can do better than 2091 * compression. 2092 */ 2093 if (rs->xbzrle_enabled) { 2094 return false; 2095 } 2096 2097 return true; 2098 } 2099 2100 /* 2101 * try to compress the page before posting it out, return true if the page 2102 * has been properly handled by compression, otherwise needs other 2103 * paths to handle it 2104 */ 2105 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2106 { 2107 if (!save_page_use_compression(rs)) { 2108 return false; 2109 } 2110 2111 /* 2112 * When starting the process of a new block, the first page of 2113 * the block should be sent out before other pages in the same 2114 * block, and all the pages in last block should have been sent 2115 * out, keeping this order is important, because the 'cont' flag 2116 * is used to avoid resending the block name. 2117 * 2118 * We post the fist page as normal page as compression will take 2119 * much CPU resource. 
2120 */ 2121 if (block != rs->last_sent_block) { 2122 flush_compressed_data(rs); 2123 return false; 2124 } 2125 2126 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2127 return true; 2128 } 2129 2130 compression_counters.busy++; 2131 return false; 2132 } 2133 2134 /** 2135 * ram_save_target_page: save one target page 2136 * 2137 * Returns the number of pages written 2138 * 2139 * @rs: current RAM state 2140 * @pss: data about the page we want to send 2141 */ 2142 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2143 { 2144 RAMBlock *block = pss->block; 2145 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2146 int res; 2147 2148 if (control_save_page(rs, block, offset, &res)) { 2149 return res; 2150 } 2151 2152 if (save_compress_page(rs, block, offset)) { 2153 return 1; 2154 } 2155 2156 res = save_zero_page(rs, block, offset); 2157 if (res > 0) { 2158 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2159 * page would be stale 2160 */ 2161 if (!save_page_use_compression(rs)) { 2162 XBZRLE_cache_lock(); 2163 xbzrle_cache_zero_page(rs, block->offset + offset); 2164 XBZRLE_cache_unlock(); 2165 } 2166 return res; 2167 } 2168 2169 /* 2170 * Do not use multifd for: 2171 * 1. Compression as the first page in the new block should be posted out 2172 * before sending the compressed page 2173 * 2. In postcopy as one whole host page should be placed 2174 */ 2175 if (!save_page_use_compression(rs) && migrate_use_multifd() 2176 && !migration_in_postcopy()) { 2177 return ram_save_multifd_page(rs, block, offset); 2178 } 2179 2180 return ram_save_page(rs, pss); 2181 } 2182 2183 /** 2184 * ram_save_host_page: save a whole host page 2185 * 2186 * Starting at *offset send pages up to the end of the current host 2187 * page. It's valid for the initial offset to point into the middle of 2188 * a host page in which case the remainder of the hostpage is sent. 2189 * Only dirty target pages are sent. Note that the host page size may 2190 * be a huge page for this block. 2191 * The saving stops at the boundary of the used_length of the block 2192 * if the RAMBlock isn't a multiple of the host page size. 2193 * 2194 * Returns the number of pages written or negative on error 2195 * 2196 * @rs: current RAM state 2197 * @pss: data about the page we want to send 2198 */ 2199 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2200 { 2201 int tmppages, pages = 0; 2202 size_t pagesize_bits = 2203 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2204 unsigned long hostpage_boundary = 2205 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2206 unsigned long start_page = pss->page; 2207 int res; 2208 2209 if (ramblock_is_ignored(pss->block)) { 2210 error_report("block %s should not be migrated !", pss->block->idstr); 2211 return 0; 2212 } 2213 2214 do { 2215 /* Check the pages is dirty and if it is send it */ 2216 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2217 tmppages = ram_save_target_page(rs, pss); 2218 if (tmppages < 0) { 2219 return tmppages; 2220 } 2221 2222 pages += tmppages; 2223 /* 2224 * Allow rate limiting to happen in the middle of huge pages if 2225 * something is sent in the current iteration. 
2226 */ 2227 if (pagesize_bits > 1 && tmppages > 0) { 2228 migration_rate_limit(); 2229 } 2230 } 2231 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2232 } while ((pss->page < hostpage_boundary) && 2233 offset_in_ramblock(pss->block, 2234 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2235 /* The offset we leave with is the min boundary of host page and block */ 2236 pss->page = MIN(pss->page, hostpage_boundary); 2237 2238 res = ram_save_release_protection(rs, pss, start_page); 2239 return (res < 0 ? res : pages); 2240 } 2241 2242 /** 2243 * ram_find_and_save_block: finds a dirty page and sends it to f 2244 * 2245 * Called within an RCU critical section. 2246 * 2247 * Returns the number of pages written where zero means no dirty pages, 2248 * or negative on error 2249 * 2250 * @rs: current RAM state 2251 * 2252 * On systems where host-page-size > target-page-size it will send all the 2253 * pages in a host page that are dirty. 2254 */ 2255 static int ram_find_and_save_block(RAMState *rs) 2256 { 2257 PageSearchStatus pss; 2258 int pages = 0; 2259 bool again, found; 2260 2261 /* No dirty page as there is zero RAM */ 2262 if (!ram_bytes_total()) { 2263 return pages; 2264 } 2265 2266 pss.block = rs->last_seen_block; 2267 pss.page = rs->last_page; 2268 pss.complete_round = false; 2269 2270 if (!pss.block) { 2271 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2272 } 2273 2274 do { 2275 again = true; 2276 found = get_queued_page(rs, &pss); 2277 2278 if (!found) { 2279 /* priority queue empty, so just search for something dirty */ 2280 found = find_dirty_block(rs, &pss, &again); 2281 } 2282 2283 if (found) { 2284 pages = ram_save_host_page(rs, &pss); 2285 } 2286 } while (!pages && again); 2287 2288 rs->last_seen_block = pss.block; 2289 rs->last_page = pss.page; 2290 2291 return pages; 2292 } 2293 2294 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2295 { 2296 uint64_t pages = size / TARGET_PAGE_SIZE; 2297 2298 if (zero) { 2299 ram_counters.duplicate += pages; 2300 } else { 2301 ram_counters.normal += pages; 2302 ram_transferred_add(size); 2303 qemu_update_position(f, size); 2304 } 2305 } 2306 2307 static uint64_t ram_bytes_total_common(bool count_ignored) 2308 { 2309 RAMBlock *block; 2310 uint64_t total = 0; 2311 2312 RCU_READ_LOCK_GUARD(); 2313 2314 if (count_ignored) { 2315 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2316 total += block->used_length; 2317 } 2318 } else { 2319 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2320 total += block->used_length; 2321 } 2322 } 2323 return total; 2324 } 2325 2326 uint64_t ram_bytes_total(void) 2327 { 2328 return ram_bytes_total_common(false); 2329 } 2330 2331 static void xbzrle_load_setup(void) 2332 { 2333 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2334 } 2335 2336 static void xbzrle_load_cleanup(void) 2337 { 2338 g_free(XBZRLE.decoded_buf); 2339 XBZRLE.decoded_buf = NULL; 2340 } 2341 2342 static void ram_state_cleanup(RAMState **rsp) 2343 { 2344 if (*rsp) { 2345 migration_page_queue_free(*rsp); 2346 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2347 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2348 g_free(*rsp); 2349 *rsp = NULL; 2350 } 2351 } 2352 2353 static void xbzrle_cleanup(void) 2354 { 2355 XBZRLE_cache_lock(); 2356 if (XBZRLE.cache) { 2357 cache_fini(XBZRLE.cache); 2358 g_free(XBZRLE.encoded_buf); 2359 g_free(XBZRLE.current_buf); 2360 g_free(XBZRLE.zero_target_page); 2361 XBZRLE.cache = NULL; 2362 XBZRLE.encoded_buf = NULL; 2363 XBZRLE.current_buf = NULL; 2364 XBZRLE.zero_target_page = NULL; 2365 } 2366 
XBZRLE_cache_unlock(); 2367 } 2368 2369 static void ram_save_cleanup(void *opaque) 2370 { 2371 RAMState **rsp = opaque; 2372 RAMBlock *block; 2373 2374 /* We don't use dirty log with background snapshots */ 2375 if (!migrate_background_snapshot()) { 2376 /* caller have hold iothread lock or is in a bh, so there is 2377 * no writing race against the migration bitmap 2378 */ 2379 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2380 /* 2381 * do not stop dirty log without starting it, since 2382 * memory_global_dirty_log_stop will assert that 2383 * memory_global_dirty_log_start/stop used in pairs 2384 */ 2385 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2386 } 2387 } 2388 2389 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2390 g_free(block->clear_bmap); 2391 block->clear_bmap = NULL; 2392 g_free(block->bmap); 2393 block->bmap = NULL; 2394 } 2395 2396 xbzrle_cleanup(); 2397 compress_threads_save_cleanup(); 2398 ram_state_cleanup(rsp); 2399 } 2400 2401 static void ram_state_reset(RAMState *rs) 2402 { 2403 rs->last_seen_block = NULL; 2404 rs->last_sent_block = NULL; 2405 rs->last_page = 0; 2406 rs->last_version = ram_list.version; 2407 rs->xbzrle_enabled = false; 2408 } 2409 2410 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2411 2412 /* **** functions for postcopy ***** */ 2413 2414 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2415 { 2416 struct RAMBlock *block; 2417 2418 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2419 unsigned long *bitmap = block->bmap; 2420 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2421 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2422 2423 while (run_start < range) { 2424 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2425 ram_discard_range(block->idstr, 2426 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2427 ((ram_addr_t)(run_end - run_start)) 2428 << TARGET_PAGE_BITS); 2429 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2430 } 2431 } 2432 } 2433 2434 /** 2435 * postcopy_send_discard_bm_ram: discard a RAMBlock 2436 * 2437 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2438 * 2439 * @ms: current migration state 2440 * @block: RAMBlock to discard 2441 */ 2442 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2443 { 2444 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2445 unsigned long current; 2446 unsigned long *bitmap = block->bmap; 2447 2448 for (current = 0; current < end; ) { 2449 unsigned long one = find_next_bit(bitmap, end, current); 2450 unsigned long zero, discard_length; 2451 2452 if (one >= end) { 2453 break; 2454 } 2455 2456 zero = find_next_zero_bit(bitmap, end, one + 1); 2457 2458 if (zero >= end) { 2459 discard_length = end - one; 2460 } else { 2461 discard_length = zero - one; 2462 } 2463 postcopy_discard_send_range(ms, one, discard_length); 2464 current = one + discard_length; 2465 } 2466 } 2467 2468 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2469 2470 /** 2471 * postcopy_each_ram_send_discard: discard all RAMBlocks 2472 * 2473 * Utility for the outgoing postcopy code. 2474 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2475 * passing it bitmap indexes and name. 
2476 * (qemu_ram_foreach_block ends up passing unscaled lengths 2477 * which would mean postcopy code would have to deal with target page) 2478 * 2479 * @ms: current migration state 2480 */ 2481 static void postcopy_each_ram_send_discard(MigrationState *ms) 2482 { 2483 struct RAMBlock *block; 2484 2485 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2486 postcopy_discard_send_init(ms, block->idstr); 2487 2488 /* 2489 * Deal with TPS != HPS and huge pages. It discard any partially sent 2490 * host-page size chunks, mark any partially dirty host-page size 2491 * chunks as all dirty. In this case the host-page is the host-page 2492 * for the particular RAMBlock, i.e. it might be a huge page. 2493 */ 2494 postcopy_chunk_hostpages_pass(ms, block); 2495 2496 /* 2497 * Postcopy sends chunks of bitmap over the wire, but it 2498 * just needs indexes at this point, avoids it having 2499 * target page specific code. 2500 */ 2501 postcopy_send_discard_bm_ram(ms, block); 2502 postcopy_discard_send_finish(ms); 2503 } 2504 } 2505 2506 /** 2507 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2508 * 2509 * Helper for postcopy_chunk_hostpages; it's called twice to 2510 * canonicalize the two bitmaps, that are similar, but one is 2511 * inverted. 2512 * 2513 * Postcopy requires that all target pages in a hostpage are dirty or 2514 * clean, not a mix. This function canonicalizes the bitmaps. 2515 * 2516 * @ms: current migration state 2517 * @block: block that contains the page we want to canonicalize 2518 */ 2519 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2520 { 2521 RAMState *rs = ram_state; 2522 unsigned long *bitmap = block->bmap; 2523 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2524 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2525 unsigned long run_start; 2526 2527 if (block->page_size == TARGET_PAGE_SIZE) { 2528 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2529 return; 2530 } 2531 2532 /* Find a dirty page */ 2533 run_start = find_next_bit(bitmap, pages, 0); 2534 2535 while (run_start < pages) { 2536 2537 /* 2538 * If the start of this run of pages is in the middle of a host 2539 * page, then we need to fixup this host page. 2540 */ 2541 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2542 /* Find the end of this run */ 2543 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2544 /* 2545 * If the end isn't at the start of a host page, then the 2546 * run doesn't finish at the end of a host page 2547 * and we need to discard. 2548 */ 2549 } 2550 2551 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2552 unsigned long page; 2553 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2554 host_ratio); 2555 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2556 2557 /* Clean up the bitmap */ 2558 for (page = fixup_start_addr; 2559 page < fixup_start_addr + host_ratio; page++) { 2560 /* 2561 * Remark them as dirty, updating the count for any pages 2562 * that weren't previously dirty. 
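 *
 * Worked example (editorial, assuming host_ratio == 512, i.e. 2 MiB host
 * pages with 4 KiB target pages): a dirty run starting at target page 1000
 * gives fixup_start_addr == 512, so target pages 512..1023 are all marked
 * dirty and run_start advances to 1024.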
2563 */ 2564 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2565 } 2566 } 2567 2568 /* Find the next dirty page for the next iteration */ 2569 run_start = find_next_bit(bitmap, pages, run_start); 2570 } 2571 } 2572 2573 /** 2574 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2575 * 2576 * Transmit the set of pages to be discarded after precopy to the target 2577 * these are pages that: 2578 * a) Have been previously transmitted but are now dirty again 2579 * b) Pages that have never been transmitted, this ensures that 2580 * any pages on the destination that have been mapped by background 2581 * tasks get discarded (transparent huge pages is the specific concern) 2582 * Hopefully this is pretty sparse 2583 * 2584 * @ms: current migration state 2585 */ 2586 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2587 { 2588 RAMState *rs = ram_state; 2589 2590 RCU_READ_LOCK_GUARD(); 2591 2592 /* This should be our last sync, the src is now paused */ 2593 migration_bitmap_sync(rs); 2594 2595 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2596 rs->last_seen_block = NULL; 2597 rs->last_sent_block = NULL; 2598 rs->last_page = 0; 2599 2600 postcopy_each_ram_send_discard(ms); 2601 2602 trace_ram_postcopy_send_discard_bitmap(); 2603 } 2604 2605 /** 2606 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2607 * 2608 * Returns zero on success 2609 * 2610 * @rbname: name of the RAMBlock of the request. NULL means the 2611 * same that last one. 2612 * @start: RAMBlock starting page 2613 * @length: RAMBlock size 2614 */ 2615 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2616 { 2617 trace_ram_discard_range(rbname, start, length); 2618 2619 RCU_READ_LOCK_GUARD(); 2620 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2621 2622 if (!rb) { 2623 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2624 return -1; 2625 } 2626 2627 /* 2628 * On source VM, we don't need to update the received bitmap since 2629 * we don't even have one. 2630 */ 2631 if (rb->receivedmap) { 2632 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2633 length >> qemu_target_page_bits()); 2634 } 2635 2636 return ram_block_discard_range(rb, start, length); 2637 } 2638 2639 /* 2640 * For every allocation, we will try not to crash the VM if the 2641 * allocation failed. 
2642 */ 2643 static int xbzrle_init(void) 2644 { 2645 Error *local_err = NULL; 2646 2647 if (!migrate_use_xbzrle()) { 2648 return 0; 2649 } 2650 2651 XBZRLE_cache_lock(); 2652 2653 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2654 if (!XBZRLE.zero_target_page) { 2655 error_report("%s: Error allocating zero page", __func__); 2656 goto err_out; 2657 } 2658 2659 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2660 TARGET_PAGE_SIZE, &local_err); 2661 if (!XBZRLE.cache) { 2662 error_report_err(local_err); 2663 goto free_zero_page; 2664 } 2665 2666 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2667 if (!XBZRLE.encoded_buf) { 2668 error_report("%s: Error allocating encoded_buf", __func__); 2669 goto free_cache; 2670 } 2671 2672 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2673 if (!XBZRLE.current_buf) { 2674 error_report("%s: Error allocating current_buf", __func__); 2675 goto free_encoded_buf; 2676 } 2677 2678 /* We are all good */ 2679 XBZRLE_cache_unlock(); 2680 return 0; 2681 2682 free_encoded_buf: 2683 g_free(XBZRLE.encoded_buf); 2684 XBZRLE.encoded_buf = NULL; 2685 free_cache: 2686 cache_fini(XBZRLE.cache); 2687 XBZRLE.cache = NULL; 2688 free_zero_page: 2689 g_free(XBZRLE.zero_target_page); 2690 XBZRLE.zero_target_page = NULL; 2691 err_out: 2692 XBZRLE_cache_unlock(); 2693 return -ENOMEM; 2694 } 2695 2696 static int ram_state_init(RAMState **rsp) 2697 { 2698 *rsp = g_try_new0(RAMState, 1); 2699 2700 if (!*rsp) { 2701 error_report("%s: Init ramstate fail", __func__); 2702 return -1; 2703 } 2704 2705 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2706 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2707 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2708 2709 /* 2710 * Count the total number of pages used by ram blocks not including any 2711 * gaps due to alignment or unplugs. 2712 * This must match with the initial values of dirty bitmap. 2713 */ 2714 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2715 ram_state_reset(*rsp); 2716 2717 return 0; 2718 } 2719 2720 static void ram_list_init_bitmaps(void) 2721 { 2722 MigrationState *ms = migrate_get_current(); 2723 RAMBlock *block; 2724 unsigned long pages; 2725 uint8_t shift; 2726 2727 /* Skip setting bitmap if there is no RAM */ 2728 if (ram_bytes_total()) { 2729 shift = ms->clear_bitmap_shift; 2730 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2731 error_report("clear_bitmap_shift (%u) too big, using " 2732 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2733 shift = CLEAR_BITMAP_SHIFT_MAX; 2734 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2735 error_report("clear_bitmap_shift (%u) too small, using " 2736 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2737 shift = CLEAR_BITMAP_SHIFT_MIN; 2738 } 2739 2740 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2741 pages = block->max_length >> TARGET_PAGE_BITS; 2742 /* 2743 * The initial dirty bitmap for migration must be set with all 2744 * ones to make sure we'll migrate every guest RAM page to 2745 * destination. 2746 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2747 * new migration after a failed migration, ram_list. 2748 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2749 * guest memory. 
2750 */ 2751 block->bmap = bitmap_new(pages); 2752 bitmap_set(block->bmap, 0, pages); 2753 block->clear_bmap_shift = shift; 2754 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2755 } 2756 } 2757 } 2758 2759 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2760 { 2761 unsigned long pages; 2762 RAMBlock *rb; 2763 2764 RCU_READ_LOCK_GUARD(); 2765 2766 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2767 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2768 rs->migration_dirty_pages -= pages; 2769 } 2770 } 2771 2772 static void ram_init_bitmaps(RAMState *rs) 2773 { 2774 /* For memory_global_dirty_log_start below. */ 2775 qemu_mutex_lock_iothread(); 2776 qemu_mutex_lock_ramlist(); 2777 2778 WITH_RCU_READ_LOCK_GUARD() { 2779 ram_list_init_bitmaps(); 2780 /* We don't use dirty log with background snapshots */ 2781 if (!migrate_background_snapshot()) { 2782 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2783 migration_bitmap_sync_precopy(rs); 2784 } 2785 } 2786 qemu_mutex_unlock_ramlist(); 2787 qemu_mutex_unlock_iothread(); 2788 2789 /* 2790 * After an eventual first bitmap sync, fixup the initial bitmap 2791 * containing all 1s to exclude any discarded pages from migration. 2792 */ 2793 migration_bitmap_clear_discarded_pages(rs); 2794 } 2795 2796 static int ram_init_all(RAMState **rsp) 2797 { 2798 if (ram_state_init(rsp)) { 2799 return -1; 2800 } 2801 2802 if (xbzrle_init()) { 2803 ram_state_cleanup(rsp); 2804 return -1; 2805 } 2806 2807 ram_init_bitmaps(*rsp); 2808 2809 return 0; 2810 } 2811 2812 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2813 { 2814 RAMBlock *block; 2815 uint64_t pages = 0; 2816 2817 /* 2818 * Postcopy is not using xbzrle/compression, so no need for that. 2819 * Also, since source are already halted, we don't need to care 2820 * about dirty page logging as well. 2821 */ 2822 2823 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2824 pages += bitmap_count_one(block->bmap, 2825 block->used_length >> TARGET_PAGE_BITS); 2826 } 2827 2828 /* This may not be aligned with current bitmaps. Recalculate. */ 2829 rs->migration_dirty_pages = pages; 2830 2831 ram_state_reset(rs); 2832 2833 /* Update RAMState cache of output QEMUFile */ 2834 rs->f = out; 2835 2836 trace_ram_state_resume_prepare(pages); 2837 } 2838 2839 /* 2840 * This function clears bits of the free pages reported by the caller from the 2841 * migration dirty bitmap. @addr is the host address corresponding to the 2842 * start of the continuous guest free pages, and @len is the total bytes of 2843 * those pages. 2844 */ 2845 void qemu_guest_free_page_hint(void *addr, size_t len) 2846 { 2847 RAMBlock *block; 2848 ram_addr_t offset; 2849 size_t used_len, start, npages; 2850 MigrationState *s = migrate_get_current(); 2851 2852 /* This function is currently expected to be used during live migration */ 2853 if (!migration_is_setup_or_active(s->state)) { 2854 return; 2855 } 2856 2857 for (; len > 0; len -= used_len, addr += used_len) { 2858 block = qemu_ram_block_from_host(addr, false, &offset); 2859 if (unlikely(!block || offset >= block->used_length)) { 2860 /* 2861 * The implementation might not support RAMBlock resize during 2862 * live migration, but it could happen in theory with future 2863 * updates. So we add a check here to capture that case. 
2864 */ 2865 error_report_once("%s unexpected error", __func__); 2866 return; 2867 } 2868 2869 if (len <= block->used_length - offset) { 2870 used_len = len; 2871 } else { 2872 used_len = block->used_length - offset; 2873 } 2874 2875 start = offset >> TARGET_PAGE_BITS; 2876 npages = used_len >> TARGET_PAGE_BITS; 2877 2878 qemu_mutex_lock(&ram_state->bitmap_mutex); 2879 /* 2880 * The skipped free pages are equavalent to be sent from clear_bmap's 2881 * perspective, so clear the bits from the memory region bitmap which 2882 * are initially set. Otherwise those skipped pages will be sent in 2883 * the next round after syncing from the memory region bitmap. 2884 */ 2885 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2886 ram_state->migration_dirty_pages -= 2887 bitmap_count_one_with_offset(block->bmap, start, npages); 2888 bitmap_clear(block->bmap, start, npages); 2889 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2890 } 2891 } 2892 2893 /* 2894 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2895 * long-running RCU critical section. When rcu-reclaims in the code 2896 * start to become numerous it will be necessary to reduce the 2897 * granularity of these critical sections. 2898 */ 2899 2900 /** 2901 * ram_save_setup: Setup RAM for migration 2902 * 2903 * Returns zero to indicate success and negative for error 2904 * 2905 * @f: QEMUFile where to send the data 2906 * @opaque: RAMState pointer 2907 */ 2908 static int ram_save_setup(QEMUFile *f, void *opaque) 2909 { 2910 RAMState **rsp = opaque; 2911 RAMBlock *block; 2912 2913 if (compress_threads_save_setup()) { 2914 return -1; 2915 } 2916 2917 /* migration has already setup the bitmap, reuse it. */ 2918 if (!migration_in_colo_state()) { 2919 if (ram_init_all(rsp) != 0) { 2920 compress_threads_save_cleanup(); 2921 return -1; 2922 } 2923 } 2924 (*rsp)->f = f; 2925 2926 WITH_RCU_READ_LOCK_GUARD() { 2927 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2928 2929 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2930 qemu_put_byte(f, strlen(block->idstr)); 2931 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2932 qemu_put_be64(f, block->used_length); 2933 if (migrate_postcopy_ram() && block->page_size != 2934 qemu_host_page_size) { 2935 qemu_put_be64(f, block->page_size); 2936 } 2937 if (migrate_ignore_shared()) { 2938 qemu_put_be64(f, block->mr->addr); 2939 } 2940 } 2941 } 2942 2943 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2944 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2945 2946 multifd_send_sync_main(f); 2947 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2948 qemu_fflush(f); 2949 2950 return 0; 2951 } 2952 2953 /** 2954 * ram_save_iterate: iterative stage for migration 2955 * 2956 * Returns zero to indicate success and negative for error 2957 * 2958 * @f: QEMUFile where to send the data 2959 * @opaque: RAMState pointer 2960 */ 2961 static int ram_save_iterate(QEMUFile *f, void *opaque) 2962 { 2963 RAMState **temp = opaque; 2964 RAMState *rs = *temp; 2965 int ret = 0; 2966 int i; 2967 int64_t t0; 2968 int done = 0; 2969 2970 if (blk_mig_bulk_active()) { 2971 /* Avoid transferring ram during bulk phase of block migration as 2972 * the bulk phase will usually take a long time and transferring 2973 * ram updates during that time is pointless. */ 2974 goto out; 2975 } 2976 2977 /* 2978 * We'll take this lock a little bit long, but it's okay for two reasons. 
Firstly, the only other thread that may take it is the one that calls 2980 * qemu_guest_free_page_hint(), which should be rare; secondly, see 2981 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 2982 * guarantees that we'll at least release it on a regular basis. 2983 */ 2984 qemu_mutex_lock(&rs->bitmap_mutex); 2985 WITH_RCU_READ_LOCK_GUARD() { 2986 if (ram_list.version != rs->last_version) { 2987 ram_state_reset(rs); 2988 } 2989 2990 /* Read version before ram_list.blocks */ 2991 smp_rmb(); 2992 2993 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2994 2995 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2996 i = 0; 2997 while ((ret = qemu_file_rate_limit(f)) == 0 || 2998 postcopy_has_request(rs)) { 2999 int pages; 3000 3001 if (qemu_file_get_error(f)) { 3002 break; 3003 } 3004 3005 pages = ram_find_and_save_block(rs); 3006 /* no more pages to send */ 3007 if (pages == 0) { 3008 done = 1; 3009 break; 3010 } 3011 3012 if (pages < 0) { 3013 qemu_file_set_error(f, pages); 3014 break; 3015 } 3016 3017 rs->target_page_count += pages; 3018 3019 /* 3020 * During postcopy, it is necessary to make sure one whole host 3021 * page is sent in one chunk. 3022 */ 3023 if (migrate_postcopy_ram()) { 3024 flush_compressed_data(rs); 3025 } 3026 3027 /* 3028 * We want to check in the 1st loop, just in case it was the 1st 3029 * time and we had to sync the dirty bitmap. 3030 * qemu_clock_get_ns() is a bit expensive, so we only check every 3031 * few iterations 3032 */ 3033 if ((i & 63) == 0) { 3034 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3035 1000000; 3036 if (t1 > MAX_WAIT) { 3037 trace_ram_save_iterate_big_wait(t1, i); 3038 break; 3039 } 3040 } 3041 i++; 3042 } 3043 } 3044 qemu_mutex_unlock(&rs->bitmap_mutex); 3045 3046 /* 3047 * Must occur before EOS (or any QEMUFile operation) 3048 * because of RDMA protocol.
3049 */ 3050 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3051 3052 out: 3053 if (ret >= 0 3054 && migration_is_setup_or_active(migrate_get_current()->state)) { 3055 multifd_send_sync_main(rs->f); 3056 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3057 qemu_fflush(f); 3058 ram_transferred_add(8); 3059 3060 ret = qemu_file_get_error(f); 3061 } 3062 if (ret < 0) { 3063 return ret; 3064 } 3065 3066 return done; 3067 } 3068 3069 /** 3070 * ram_save_complete: function called to send the remaining amount of ram 3071 * 3072 * Returns zero to indicate success or negative on error 3073 * 3074 * Called with iothread lock 3075 * 3076 * @f: QEMUFile where to send the data 3077 * @opaque: RAMState pointer 3078 */ 3079 static int ram_save_complete(QEMUFile *f, void *opaque) 3080 { 3081 RAMState **temp = opaque; 3082 RAMState *rs = *temp; 3083 int ret = 0; 3084 3085 rs->last_stage = !migration_in_colo_state(); 3086 3087 WITH_RCU_READ_LOCK_GUARD() { 3088 if (!migration_in_postcopy()) { 3089 migration_bitmap_sync_precopy(rs); 3090 } 3091 3092 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3093 3094 /* try transferring iterative blocks of memory */ 3095 3096 /* flush all remaining blocks regardless of rate limiting */ 3097 while (true) { 3098 int pages; 3099 3100 pages = ram_find_and_save_block(rs); 3101 /* no more blocks to sent */ 3102 if (pages == 0) { 3103 break; 3104 } 3105 if (pages < 0) { 3106 ret = pages; 3107 break; 3108 } 3109 } 3110 3111 flush_compressed_data(rs); 3112 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3113 } 3114 3115 if (ret >= 0) { 3116 multifd_send_sync_main(rs->f); 3117 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3118 qemu_fflush(f); 3119 } 3120 3121 return ret; 3122 } 3123 3124 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3125 uint64_t *res_precopy_only, 3126 uint64_t *res_compatible, 3127 uint64_t *res_postcopy_only) 3128 { 3129 RAMState **temp = opaque; 3130 RAMState *rs = *temp; 3131 uint64_t remaining_size; 3132 3133 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3134 3135 if (!migration_in_postcopy() && 3136 remaining_size < max_size) { 3137 qemu_mutex_lock_iothread(); 3138 WITH_RCU_READ_LOCK_GUARD() { 3139 migration_bitmap_sync_precopy(rs); 3140 } 3141 qemu_mutex_unlock_iothread(); 3142 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3143 } 3144 3145 if (migrate_postcopy_ram()) { 3146 /* We can do postcopy, and all the data is postcopiable */ 3147 *res_compatible += remaining_size; 3148 } else { 3149 *res_precopy_only += remaining_size; 3150 } 3151 } 3152 3153 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3154 { 3155 unsigned int xh_len; 3156 int xh_flags; 3157 uint8_t *loaded_data; 3158 3159 /* extract RLE header */ 3160 xh_flags = qemu_get_byte(f); 3161 xh_len = qemu_get_be16(f); 3162 3163 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3164 error_report("Failed to load XBZRLE page - wrong compression!"); 3165 return -1; 3166 } 3167 3168 if (xh_len > TARGET_PAGE_SIZE) { 3169 error_report("Failed to load XBZRLE page - len overflow!"); 3170 return -1; 3171 } 3172 loaded_data = XBZRLE.decoded_buf; 3173 /* load data and decode */ 3174 /* it can change loaded_data to point to an internal buffer */ 3175 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3176 3177 /* decode RLE */ 3178 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3179 TARGET_PAGE_SIZE) == -1) { 3180 error_report("Failed to load XBZRLE page - decode error!"); 3181 return -1; 3182 } 3183 3184 return 0; 3185 } 3186 3187 /** 3188 * 
ram_block_from_stream: read a RAMBlock id from the migration stream 3189 * 3190 * Must be called from within a rcu critical section. 3191 * 3192 * Returns a pointer from within the RCU-protected ram_list. 3193 * 3194 * @mis: the migration incoming state pointer 3195 * @f: QEMUFile where to read the data from 3196 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3197 */ 3198 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3199 QEMUFile *f, int flags) 3200 { 3201 RAMBlock *block = mis->last_recv_block; 3202 char id[256]; 3203 uint8_t len; 3204 3205 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3206 if (!block) { 3207 error_report("Ack, bad migration stream!"); 3208 return NULL; 3209 } 3210 return block; 3211 } 3212 3213 len = qemu_get_byte(f); 3214 qemu_get_buffer(f, (uint8_t *)id, len); 3215 id[len] = 0; 3216 3217 block = qemu_ram_block_by_name(id); 3218 if (!block) { 3219 error_report("Can't find block %s", id); 3220 return NULL; 3221 } 3222 3223 if (ramblock_is_ignored(block)) { 3224 error_report("block %s should not be migrated !", id); 3225 return NULL; 3226 } 3227 3228 mis->last_recv_block = block; 3229 3230 return block; 3231 } 3232 3233 static inline void *host_from_ram_block_offset(RAMBlock *block, 3234 ram_addr_t offset) 3235 { 3236 if (!offset_in_ramblock(block, offset)) { 3237 return NULL; 3238 } 3239 3240 return block->host + offset; 3241 } 3242 3243 static void *host_page_from_ram_block_offset(RAMBlock *block, 3244 ram_addr_t offset) 3245 { 3246 /* Note: Explicitly no check against offset_in_ramblock(). */ 3247 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3248 block->page_size); 3249 } 3250 3251 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3252 ram_addr_t offset) 3253 { 3254 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3255 } 3256 3257 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3258 ram_addr_t offset, bool record_bitmap) 3259 { 3260 if (!offset_in_ramblock(block, offset)) { 3261 return NULL; 3262 } 3263 if (!block->colo_cache) { 3264 error_report("%s: colo_cache is NULL in block :%s", 3265 __func__, block->idstr); 3266 return NULL; 3267 } 3268 3269 /* 3270 * During colo checkpoint, we need bitmap of these migrated pages. 3271 * It help us to decide which pages in ram cache should be flushed 3272 * into VM's RAM later. 3273 */ 3274 if (record_bitmap && 3275 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3276 ram_state->migration_dirty_pages++; 3277 } 3278 return block->colo_cache + offset; 3279 } 3280 3281 /** 3282 * ram_handle_compressed: handle the zero page case 3283 * 3284 * If a page (or a whole RDMA chunk) has been 3285 * determined to be zero, then zap it. 3286 * 3287 * @host: host address for the zero page 3288 * @ch: what the page is filled from. 
We only support zero 3289 * @size: size of the zero page 3290 */ 3291 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3292 { 3293 if (ch != 0 || !buffer_is_zero(host, size)) { 3294 memset(host, ch, size); 3295 } 3296 } 3297 3298 /* return the size after decompression, or negative value on error */ 3299 static int 3300 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3301 const uint8_t *source, size_t source_len) 3302 { 3303 int err; 3304 3305 err = inflateReset(stream); 3306 if (err != Z_OK) { 3307 return -1; 3308 } 3309 3310 stream->avail_in = source_len; 3311 stream->next_in = (uint8_t *)source; 3312 stream->avail_out = dest_len; 3313 stream->next_out = dest; 3314 3315 err = inflate(stream, Z_NO_FLUSH); 3316 if (err != Z_STREAM_END) { 3317 return -1; 3318 } 3319 3320 return stream->total_out; 3321 } 3322 3323 static void *do_data_decompress(void *opaque) 3324 { 3325 DecompressParam *param = opaque; 3326 unsigned long pagesize; 3327 uint8_t *des; 3328 int len, ret; 3329 3330 qemu_mutex_lock(&param->mutex); 3331 while (!param->quit) { 3332 if (param->des) { 3333 des = param->des; 3334 len = param->len; 3335 param->des = 0; 3336 qemu_mutex_unlock(&param->mutex); 3337 3338 pagesize = TARGET_PAGE_SIZE; 3339 3340 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3341 param->compbuf, len); 3342 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3343 error_report("decompress data failed"); 3344 qemu_file_set_error(decomp_file, ret); 3345 } 3346 3347 qemu_mutex_lock(&decomp_done_lock); 3348 param->done = true; 3349 qemu_cond_signal(&decomp_done_cond); 3350 qemu_mutex_unlock(&decomp_done_lock); 3351 3352 qemu_mutex_lock(&param->mutex); 3353 } else { 3354 qemu_cond_wait(&param->cond, &param->mutex); 3355 } 3356 } 3357 qemu_mutex_unlock(&param->mutex); 3358 3359 return NULL; 3360 } 3361 3362 static int wait_for_decompress_done(void) 3363 { 3364 int idx, thread_count; 3365 3366 if (!migrate_use_compression()) { 3367 return 0; 3368 } 3369 3370 thread_count = migrate_decompress_threads(); 3371 qemu_mutex_lock(&decomp_done_lock); 3372 for (idx = 0; idx < thread_count; idx++) { 3373 while (!decomp_param[idx].done) { 3374 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3375 } 3376 } 3377 qemu_mutex_unlock(&decomp_done_lock); 3378 return qemu_file_get_error(decomp_file); 3379 } 3380 3381 static void compress_threads_load_cleanup(void) 3382 { 3383 int i, thread_count; 3384 3385 if (!migrate_use_compression()) { 3386 return; 3387 } 3388 thread_count = migrate_decompress_threads(); 3389 for (i = 0; i < thread_count; i++) { 3390 /* 3391 * we use it as an indicator which shows if the thread is 3392 * properly init'd or not 3393 */ 3394 if (!decomp_param[i].compbuf) { 3395 break; 3396 } 3397 3398 qemu_mutex_lock(&decomp_param[i].mutex); 3399 decomp_param[i].quit = true; 3400 qemu_cond_signal(&decomp_param[i].cond); 3401 qemu_mutex_unlock(&decomp_param[i].mutex); 3402 } 3403 for (i = 0; i < thread_count; i++) { 3404 if (!decomp_param[i].compbuf) { 3405 break; 3406 } 3407 3408 qemu_thread_join(decompress_threads + i); 3409 qemu_mutex_destroy(&decomp_param[i].mutex); 3410 qemu_cond_destroy(&decomp_param[i].cond); 3411 inflateEnd(&decomp_param[i].stream); 3412 g_free(decomp_param[i].compbuf); 3413 decomp_param[i].compbuf = NULL; 3414 } 3415 g_free(decompress_threads); 3416 g_free(decomp_param); 3417 decompress_threads = NULL; 3418 decomp_param = NULL; 3419 decomp_file = NULL; 3420 } 3421 3422 static int compress_threads_load_setup(QEMUFile *f) 3423 { 3424 int i, thread_count;
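    /*
     * Editorial sketch (hedged, not part of the original source): each
     * decompress worker created below pairs with the feeder in
     * decompress_data_with_multi_threads() roughly as
     *
     *     feeder:                         worker (do_data_decompress):
     *         param->des = host;              wait on param->cond
     *         param->len = len;               inflate into param->des
     *         cond_signal(&param->cond);      param->done = true;
     *                                         cond_signal(&decomp_done_cond);
     *
     * with decomp_done_lock/decomp_done_cond used to find idle workers and
     * to wait for all of them in wait_for_decompress_done().
     */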
3425 3426 if (!migrate_use_compression()) { 3427 return 0; 3428 } 3429 3430 thread_count = migrate_decompress_threads(); 3431 decompress_threads = g_new0(QemuThread, thread_count); 3432 decomp_param = g_new0(DecompressParam, thread_count); 3433 qemu_mutex_init(&decomp_done_lock); 3434 qemu_cond_init(&decomp_done_cond); 3435 decomp_file = f; 3436 for (i = 0; i < thread_count; i++) { 3437 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3438 goto exit; 3439 } 3440 3441 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3442 qemu_mutex_init(&decomp_param[i].mutex); 3443 qemu_cond_init(&decomp_param[i].cond); 3444 decomp_param[i].done = true; 3445 decomp_param[i].quit = false; 3446 qemu_thread_create(decompress_threads + i, "decompress", 3447 do_data_decompress, decomp_param + i, 3448 QEMU_THREAD_JOINABLE); 3449 } 3450 return 0; 3451 exit: 3452 compress_threads_load_cleanup(); 3453 return -1; 3454 } 3455 3456 static void decompress_data_with_multi_threads(QEMUFile *f, 3457 void *host, int len) 3458 { 3459 int idx, thread_count; 3460 3461 thread_count = migrate_decompress_threads(); 3462 QEMU_LOCK_GUARD(&decomp_done_lock); 3463 while (true) { 3464 for (idx = 0; idx < thread_count; idx++) { 3465 if (decomp_param[idx].done) { 3466 decomp_param[idx].done = false; 3467 qemu_mutex_lock(&decomp_param[idx].mutex); 3468 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3469 decomp_param[idx].des = host; 3470 decomp_param[idx].len = len; 3471 qemu_cond_signal(&decomp_param[idx].cond); 3472 qemu_mutex_unlock(&decomp_param[idx].mutex); 3473 break; 3474 } 3475 } 3476 if (idx < thread_count) { 3477 break; 3478 } else { 3479 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3480 } 3481 } 3482 } 3483 3484 static void colo_init_ram_state(void) 3485 { 3486 ram_state_init(&ram_state); 3487 } 3488 3489 /* 3490 * colo cache: this is for secondary VM, we cache the whole 3491 * memory of the secondary VM, it is need to hold the global lock 3492 * to call this helper. 3493 */ 3494 int colo_init_ram_cache(void) 3495 { 3496 RAMBlock *block; 3497 3498 WITH_RCU_READ_LOCK_GUARD() { 3499 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3500 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3501 NULL, false, false); 3502 if (!block->colo_cache) { 3503 error_report("%s: Can't alloc memory for COLO cache of block %s," 3504 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3505 block->used_length); 3506 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3507 if (block->colo_cache) { 3508 qemu_anon_ram_free(block->colo_cache, block->used_length); 3509 block->colo_cache = NULL; 3510 } 3511 } 3512 return -errno; 3513 } 3514 if (!machine_dump_guest_core(current_machine)) { 3515 qemu_madvise(block->colo_cache, block->used_length, 3516 QEMU_MADV_DONTDUMP); 3517 } 3518 } 3519 } 3520 3521 /* 3522 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3523 * with to decide which page in cache should be flushed into SVM's RAM. Here 3524 * we use the same name 'ram_bitmap' as for migration. 3525 */ 3526 if (ram_bytes_total()) { 3527 RAMBlock *block; 3528 3529 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3530 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3531 block->bmap = bitmap_new(pages); 3532 } 3533 } 3534 3535 colo_init_ram_state(); 3536 return 0; 3537 } 3538 3539 /* TODO: duplicated with ram_init_bitmaps */ 3540 void colo_incoming_start_dirty_log(void) 3541 { 3542 RAMBlock *block = NULL; 3543 /* For memory_global_dirty_log_start below. 
*/ 3544 qemu_mutex_lock_iothread(); 3545 qemu_mutex_lock_ramlist(); 3546 3547 memory_global_dirty_log_sync(); 3548 WITH_RCU_READ_LOCK_GUARD() { 3549 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3550 ramblock_sync_dirty_bitmap(ram_state, block); 3551 /* Discard this dirty bitmap record */ 3552 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3553 } 3554 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3555 } 3556 ram_state->migration_dirty_pages = 0; 3557 qemu_mutex_unlock_ramlist(); 3558 qemu_mutex_unlock_iothread(); 3559 } 3560 3561 /* It is need to hold the global lock to call this helper */ 3562 void colo_release_ram_cache(void) 3563 { 3564 RAMBlock *block; 3565 3566 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3567 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3568 g_free(block->bmap); 3569 block->bmap = NULL; 3570 } 3571 3572 WITH_RCU_READ_LOCK_GUARD() { 3573 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3574 if (block->colo_cache) { 3575 qemu_anon_ram_free(block->colo_cache, block->used_length); 3576 block->colo_cache = NULL; 3577 } 3578 } 3579 } 3580 ram_state_cleanup(&ram_state); 3581 } 3582 3583 /** 3584 * ram_load_setup: Setup RAM for migration incoming side 3585 * 3586 * Returns zero to indicate success and negative for error 3587 * 3588 * @f: QEMUFile where to receive the data 3589 * @opaque: RAMState pointer 3590 */ 3591 static int ram_load_setup(QEMUFile *f, void *opaque) 3592 { 3593 if (compress_threads_load_setup(f)) { 3594 return -1; 3595 } 3596 3597 xbzrle_load_setup(); 3598 ramblock_recv_map_init(); 3599 3600 return 0; 3601 } 3602 3603 static int ram_load_cleanup(void *opaque) 3604 { 3605 RAMBlock *rb; 3606 3607 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3608 qemu_ram_block_writeback(rb); 3609 } 3610 3611 xbzrle_load_cleanup(); 3612 compress_threads_load_cleanup(); 3613 3614 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3615 g_free(rb->receivedmap); 3616 rb->receivedmap = NULL; 3617 } 3618 3619 return 0; 3620 } 3621 3622 /** 3623 * ram_postcopy_incoming_init: allocate postcopy data structures 3624 * 3625 * Returns 0 for success and negative if there was one error 3626 * 3627 * @mis: current migration incoming state 3628 * 3629 * Allocate data structures etc needed by incoming migration with 3630 * postcopy-ram. postcopy-ram's similarly names 3631 * postcopy_ram_incoming_init does the work. 3632 */ 3633 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3634 { 3635 return postcopy_ram_incoming_init(mis); 3636 } 3637 3638 /** 3639 * ram_load_postcopy: load a page in postcopy case 3640 * 3641 * Returns 0 for success or -errno in case of error 3642 * 3643 * Called in postcopy mode by ram_load(). 3644 * rcu_read_lock is taken prior to this being called. 3645 * 3646 * @f: QEMUFile where to send the data 3647 */ 3648 int ram_load_postcopy(QEMUFile *f) 3649 { 3650 int flags = 0, ret = 0; 3651 bool place_needed = false; 3652 bool matches_target_page_size = false; 3653 MigrationIncomingState *mis = migration_incoming_get_current(); 3654 /* Currently we only use channel 0. 
TODO: use all the channels */ 3655 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0]; 3656 3657 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3658 ram_addr_t addr; 3659 void *page_buffer = NULL; 3660 void *place_source = NULL; 3661 RAMBlock *block = NULL; 3662 uint8_t ch; 3663 int len; 3664 3665 addr = qemu_get_be64(f); 3666 3667 /* 3668 * If qemu file error, we should stop here, and then "addr" 3669 * may be invalid 3670 */ 3671 ret = qemu_file_get_error(f); 3672 if (ret) { 3673 break; 3674 } 3675 3676 flags = addr & ~TARGET_PAGE_MASK; 3677 addr &= TARGET_PAGE_MASK; 3678 3679 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3680 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3681 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3682 block = ram_block_from_stream(mis, f, flags); 3683 if (!block) { 3684 ret = -EINVAL; 3685 break; 3686 } 3687 3688 /* 3689 * Relying on used_length is racy and can result in false positives. 3690 * We might place pages beyond used_length in case RAM was shrunk 3691 * while in postcopy, which is fine - trying to place via 3692 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3693 */ 3694 if (!block->host || addr >= block->postcopy_length) { 3695 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3696 ret = -EINVAL; 3697 break; 3698 } 3699 tmp_page->target_pages++; 3700 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3701 /* 3702 * Postcopy requires that we place whole host pages atomically; 3703 * these may be huge pages for RAMBlocks that are backed by 3704 * hugetlbfs. 3705 * To make it atomic, the data is read into a temporary page 3706 * that's moved into place later. 3707 * The migration protocol uses, possibly smaller, target-pages 3708 * however the source ensures it always sends all the components 3709 * of a host page in one chunk. 3710 */ 3711 page_buffer = tmp_page->tmp_huge_page + 3712 host_page_offset_from_ram_block_offset(block, addr); 3713 /* If all TP are zero then we can optimise the place */ 3714 if (tmp_page->target_pages == 1) { 3715 tmp_page->host_addr = 3716 host_page_from_ram_block_offset(block, addr); 3717 } else if (tmp_page->host_addr != 3718 host_page_from_ram_block_offset(block, addr)) { 3719 /* not the 1st TP within the HP */ 3720 error_report("Non-same host page detected. " 3721 "Target host page %p, received host page %p " 3722 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3723 tmp_page->host_addr, 3724 host_page_from_ram_block_offset(block, addr), 3725 block->idstr, addr, tmp_page->target_pages); 3726 ret = -EINVAL; 3727 break; 3728 } 3729 3730 /* 3731 * If it's the last part of a host page then we place the host 3732 * page 3733 */ 3734 if (tmp_page->target_pages == 3735 (block->page_size / TARGET_PAGE_SIZE)) { 3736 place_needed = true; 3737 } 3738 place_source = tmp_page->tmp_huge_page; 3739 } 3740 3741 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3742 case RAM_SAVE_FLAG_ZERO: 3743 ch = qemu_get_byte(f); 3744 /* 3745 * Can skip to set page_buffer when 3746 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3747 */ 3748 if (ch || !matches_target_page_size) { 3749 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3750 } 3751 if (ch) { 3752 tmp_page->all_zero = false; 3753 } 3754 break; 3755 3756 case RAM_SAVE_FLAG_PAGE: 3757 tmp_page->all_zero = false; 3758 if (!matches_target_page_size) { 3759 /* For huge pages, we always use temporary buffer */ 3760 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3761 } else { 3762 /* 3763 * For small pages that matches target page size, we 3764 * avoid the qemu_file copy. Instead we directly use 3765 * the buffer of QEMUFile to place the page. Note: we 3766 * cannot do any QEMUFile operation before using that 3767 * buffer to make sure the buffer is valid when 3768 * placing the page. 3769 */ 3770 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3771 TARGET_PAGE_SIZE); 3772 } 3773 break; 3774 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3775 tmp_page->all_zero = false; 3776 len = qemu_get_be32(f); 3777 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3778 error_report("Invalid compressed data length: %d", len); 3779 ret = -EINVAL; 3780 break; 3781 } 3782 decompress_data_with_multi_threads(f, page_buffer, len); 3783 break; 3784 3785 case RAM_SAVE_FLAG_EOS: 3786 /* normal exit */ 3787 multifd_recv_sync_main(); 3788 break; 3789 default: 3790 error_report("Unknown combination of migration flags: 0x%x" 3791 " (postcopy mode)", flags); 3792 ret = -EINVAL; 3793 break; 3794 } 3795 3796 /* Got the whole host page, wait for decompress before placing. */ 3797 if (place_needed) { 3798 ret |= wait_for_decompress_done(); 3799 } 3800 3801 /* Detect for any possible file errors */ 3802 if (!ret && qemu_file_get_error(f)) { 3803 ret = qemu_file_get_error(f); 3804 } 3805 3806 if (!ret && place_needed) { 3807 if (tmp_page->all_zero) { 3808 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3809 } else { 3810 ret = postcopy_place_page(mis, tmp_page->host_addr, 3811 place_source, block); 3812 } 3813 place_needed = false; 3814 postcopy_temp_page_reset(tmp_page); 3815 } 3816 } 3817 3818 return ret; 3819 } 3820 3821 static bool postcopy_is_advised(void) 3822 { 3823 PostcopyState ps = postcopy_state_get(); 3824 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3825 } 3826 3827 static bool postcopy_is_running(void) 3828 { 3829 PostcopyState ps = postcopy_state_get(); 3830 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3831 } 3832 3833 /* 3834 * Flush content of RAM cache into SVM's memory. 3835 * Only flush the pages that be dirtied by PVM or SVM or both. 
3836 */ 3837 void colo_flush_ram_cache(void) 3838 { 3839 RAMBlock *block = NULL; 3840 void *dst_host; 3841 void *src_host; 3842 unsigned long offset = 0; 3843 3844 memory_global_dirty_log_sync(); 3845 WITH_RCU_READ_LOCK_GUARD() { 3846 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3847 ramblock_sync_dirty_bitmap(ram_state, block); 3848 } 3849 } 3850 3851 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3852 WITH_RCU_READ_LOCK_GUARD() { 3853 block = QLIST_FIRST_RCU(&ram_list.blocks); 3854 3855 while (block) { 3856 unsigned long num = 0; 3857 3858 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3859 if (!offset_in_ramblock(block, 3860 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3861 offset = 0; 3862 num = 0; 3863 block = QLIST_NEXT_RCU(block, next); 3864 } else { 3865 unsigned long i = 0; 3866 3867 for (i = 0; i < num; i++) { 3868 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3869 } 3870 dst_host = block->host 3871 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3872 src_host = block->colo_cache 3873 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3874 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3875 offset += num; 3876 } 3877 } 3878 } 3879 trace_colo_flush_ram_cache_end(); 3880 } 3881 3882 /** 3883 * ram_load_precopy: load pages in precopy case 3884 * 3885 * Returns 0 for success or -errno in case of error 3886 * 3887 * Called in precopy mode by ram_load(). 3888 * rcu_read_lock is taken prior to this being called. 3889 * 3890 * @f: QEMUFile where to send the data 3891 */ 3892 static int ram_load_precopy(QEMUFile *f) 3893 { 3894 MigrationIncomingState *mis = migration_incoming_get_current(); 3895 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3896 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3897 bool postcopy_advised = postcopy_is_advised(); 3898 if (!migrate_use_compression()) { 3899 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3900 } 3901 3902 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3903 ram_addr_t addr, total_ram_bytes; 3904 void *host = NULL, *host_bak = NULL; 3905 uint8_t ch; 3906 3907 /* 3908 * Yield periodically to let main loop run, but an iteration of 3909 * the main loop is expensive, so do it each some iterations 3910 */ 3911 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3912 aio_co_schedule(qemu_get_current_aio_context(), 3913 qemu_coroutine_self()); 3914 qemu_coroutine_yield(); 3915 } 3916 i++; 3917 3918 addr = qemu_get_be64(f); 3919 flags = addr & ~TARGET_PAGE_MASK; 3920 addr &= TARGET_PAGE_MASK; 3921 3922 if (flags & invalid_flags) { 3923 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3924 error_report("Received an unexpected compressed page"); 3925 } 3926 3927 ret = -EINVAL; 3928 break; 3929 } 3930 3931 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3932 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3933 RAMBlock *block = ram_block_from_stream(mis, f, flags); 3934 3935 host = host_from_ram_block_offset(block, addr); 3936 /* 3937 * After going into COLO stage, we should not load the page 3938 * into SVM's memory directly, we put them into colo_cache firstly. 3939 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3940 * Previously, we copied all these memory in preparing stage of COLO 3941 * while we need to stop VM, which is a time-consuming process. 

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into the SVM's memory directly; we put it into colo_cache
             * first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory in the COLO preparing
             * stage, during which the VM had to be stopped, which is a
             * time-consuming process.  Here we optimize it by backing up
             * every page during the migration process while COLO is
             * enabled.  Although this slows the migration down a bit, it
             * clearly reduces the downtime compared to backing up all of
             * the SVM's memory in the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both the cache and the SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}
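
/*
 * The header consumed at the top of the loop in ram_load_precopy() is a
 * single 64-bit value: the page address lives in the bits covered by
 * TARGET_PAGE_MASK and the RAM_SAVE_FLAG_* bits sit below it.  The two
 * helpers below are an illustrative sketch of that packing (the names
 * are made up; the real sender and receiver just apply the masks inline).
 */
static inline uint64_t example_pack_page_header(ram_addr_t addr, int flags)
{
    /* addr is assumed to be target-page aligned */
    return (addr & TARGET_PAGE_MASK) | (flags & ~TARGET_PAGE_MASK);
}

static inline void example_unpack_page_header(uint64_t header,
                                              ram_addr_t *addr, int *flags)
{
    *flags = header & ~TARGET_PAGE_MASK;
    *addr = header & TARGET_PAGE_MASK;
}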

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps have been synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
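
/*
 * ram_dirty_bitmap_sync_all() and ram_dirty_bitmap_reload_notify() above
 * form a simple counting rendezvous: one request is sent per RAMBlock,
 * every completed reload posts the semaphore once, and the sender waits
 * once per request.  A minimal sketch of that wait side, with an
 * illustrative name and a bare QemuSemaphore instead of the migration
 * state fields:
 */
static inline void example_wait_for_all_replies(QemuSemaphore *sem,
                                                int nr_requests)
{
    while (nr_requests--) {
        /* One wait per outstanding request; completion order does not matter. */
        qemu_sem_wait(sem);
    }
}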

/*
 * Read the received bitmap and revert it as the initial dirty bitmap.
 * This is only used when a postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused), so the
     * dirty bitmap won't change; we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for the current ramblock.
     * If this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}
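
/*
 * The stream parsed by ram_dirty_bitmap_reload() above is laid out as
 *   be64 size | size bytes of little-endian bitmap | be64 end mark
 * and the destination reports which pages it has *received*, while the
 * resumed source needs the complementary set of pages that are still
 * *dirty*.  A condensed sketch of that final conversion over a plain
 * bitmap (illustrative name only):
 */
static inline void example_received_to_dirty(unsigned long *bmap,
                                             const unsigned long *le_bitmap,
                                             unsigned long nbits)
{
    /* Convert from the on-wire little-endian layout ... */
    bitmap_from_le(bmap, le_bitmap, nbits);
    /* ... then invert: "not yet received" means "still needs sending". */
    bitmap_complement(bmap, bmap, nbits);
}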

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
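
/*
 * Usage sketch: other subsystems can watch RAM block resizes the same way
 * ram_mig_init() does above -- fill in a RAMBlockNotifier's
 * .ram_block_resized callback and register it once at initialization.
 * The callback, notifier, and init function below are hypothetical
 * examples, not part of the migration code.
 */
static void example_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    /* React to the resize here, e.g. refresh any cached block size. */
}

static RAMBlockNotifier example_ram_notifier = {
    .ram_block_resized = example_ram_block_resized,
};

static inline void example_ram_notifier_init(void)
{
    ram_block_notifier_add(&example_ram_notifier);
}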