/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

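/*
 * Illustrative note (my reading of the stream format, not a normative
 * description): page offsets are always target-page aligned, so the low
 * bits of the 64-bit offset word written by save_page_header() are free
 * to carry the flags above.  A normal page record roughly starts with
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * and the receiver splits the value back into an address part
 * (value & TARGET_PAGE_MASK) and a flags part (value & ~TARGET_PAGE_MASK).
 */
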
XBZRLECacheStats xbzrle_counters;

/* struct containing the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by the XBZRLE.lock mutex.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

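/*
 * Illustrative usage sketch (an assumption based on the helpers above,
 * not copied from the actual call sites): the destination marks every
 * page it has written into guest memory, so postcopy can later tell
 * "already received" pages apart from pages it still has to request:
 *
 *     ramblock_recv_bitmap_set(rb, host_addr);
 *     if (ramblock_recv_bitmap_test(rb, host_addr)) {
 *         // page already present, no need to fault it in again
 *     }
 */
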
#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required so that the bitmap is interpreted correctly even when
     * source and destination VMs are not using the same endianness.
     * (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines.  We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

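/*
 * Rough picture of what ramblock_recv_bitmap_send() above puts on the
 * wire (informal, derived from the code rather than a separate spec):
 *
 *     be64: bitmap size in bytes, rounded up to 8
 *     size bytes: the receivedmap, in little-endian bit order
 *     be64: RAMBLOCK_RECV_BITMAP_ENDING sanity marker
 *
 * The return value "size + sizeof(size)" counts the bitmap plus the
 * leading length word, not the trailing end marker.
 */
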
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have found the dirty rate too high */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Does postcopy have queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

static void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        ram_counters.postcopy_bytes += bytes;
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    ram_counters.transferred += bytes;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

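/*
 * Informal summary of the handshake between the migration thread and a
 * compression worker (my reading of the code, a hint rather than
 * authoritative documentation):
 *
 *   migration thread                      compression worker
 *   ----------------                      ------------------
 *   lock param->mutex                     wait on param->cond
 *   set param->block / param->offset      wake up, copy block/offset
 *   signal param->cond, unlock            compress into param->file
 *   wait on comp_done_cond                set param->done = true
 *                                         signal comp_done_cond
 *
 * The migration thread later drains param->file into the migration
 * stream with qemu_put_qemu_file() (see flush_compressed_data() and
 * compress_page_with_multi_thread() below).
 */
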
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator of whether the thread is
         * properly initialized or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page;
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

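/*
 * Sketch of the resulting stream layout (illustration derived from the
 * function above, not a normative format description):
 *
 *   first page of a block:  be64(offset | flags)  u8(len)  idstr[len]
 *   subsequent pages:       be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)
 *
 * i.e. the block name is only repeated when we switch blocks, which is
 * exactly what the RAM_SAVE_FLAG_CONTINUE optimisation is for.
 */
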
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes.  If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration.  Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet.  Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by the guest, which
             * would make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}

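/*
 * Worked example for the tailslow path in mig_throttle_guest_down()
 * above (numbers are purely illustrative): with throttle_now = 60 the
 * guest currently gets cpu_now = 40% of CPU.  If it dirtied twice as
 * many bytes as we could send (bytes_dirty_threshold / bytes_dirty_period
 * = 0.5), then cpu_ideal = 40 * 0.5 = 20, so the throttle is raised by
 * MIN(40 - 20, cpu_throttle_increment) instead of always jumping by the
 * full increment near the end of convergence.
 */
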
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

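/*
 * For reference (informal, mirrors what save_xbzrle_page() below emits):
 * an XBZRLE page record is
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *   u8   ENCODING_FLAG_XBZRLE
 *   be16 encoded_len
 *   encoded_len bytes of xbzrle-encoded delta against the cached copy
 *
 * The delta is applied on top of the copy of the page that the
 * destination already has.
 */
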
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded.  This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included.  In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be aligned to 64 pages, so the
     * bitmap will always be aligned to unsigned long.  We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all
     * of them.  Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size.  If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
        / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

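/*
 * Illustrative numbers for the trigger logic below (my own example; the
 * actual parameter values come from the migration parameters): with
 * throttle_trigger_threshold = 50 and 1 GiB transferred during the last
 * period, bytes_dirty_threshold is 512 MiB.  Only if the guest dirties
 * more than that in two consecutive periods do we start (or tighten)
 * CPU throttling via mig_throttle_guest_down().
 */
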
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

static void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

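/*
 * Note (informal): a zero page therefore costs only the page header plus
 * a single zero byte on the wire (roughly 9 bytes when
 * RAM_SAVE_FLAG_CONTINUE applies), which is why zero pages are accounted
 * as "duplicate" rather than "normal" full-page sends.
 */
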
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the pages have been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_transferred_add(save_page_header(rs, rs->f, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        return true;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch errors during compression and decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using data compression at this
             * point.  In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;
    size_t page_size;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;
    page_size = qemu_ram_pagesize(block);
    /* Each page request should only be a multiple of the ramblock page size */
    assert((entry->len % page_size) == 0);

    if (entry->len > page_size) {
        entry->len -= page_size;
        entry->offset += page_size;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    trace_unqueue_page(block->idstr, *offset,
                       test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet.  This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < size; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

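/*
 * Rough lifecycle of the UFFD-WP based background snapshot, as I read the
 * helpers in this file (informal overview, not authoritative):
 *
 *   ram_write_tracking_available()   - probe kernel support
 *   ram_write_tracking_compatible()  - probe that all RAM blocks accept WP
 *   ram_write_tracking_prepare()     - touch pages so PTEs exist
 *   ram_write_tracking_start()       - register blocks and arm protection
 *   poll_fault_page() /
 *   ram_save_release_protection()    - save faulting pages, then unprotect
 *   ram_write_tracking_stop()        - drop protection and close the fd
 */
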
/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock.  Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically.  We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, block->host,
                block->max_length, true, false)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

1876 RCU_READ_LOCK_GUARD(); 1877 1878 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1879 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1880 continue; 1881 } 1882 /* Remove protection and unregister all affected RAM blocks */ 1883 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1884 false, false); 1885 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1886 1887 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1888 block->host, block->max_length); 1889 1890 /* Cleanup flags and remove reference */ 1891 block->flags &= ~RAM_UF_WRITEPROTECT; 1892 memory_region_unref(block->mr); 1893 } 1894 1895 /* Finally close UFFD file descriptor */ 1896 uffd_close_fd(rs->uffdio_fd); 1897 rs->uffdio_fd = -1; 1898 } 1899 1900 #else 1901 /* No target OS support, stubs just fail or ignore */ 1902 1903 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1904 { 1905 (void) rs; 1906 (void) offset; 1907 1908 return NULL; 1909 } 1910 1911 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1912 unsigned long start_page) 1913 { 1914 (void) rs; 1915 (void) pss; 1916 (void) start_page; 1917 1918 return 0; 1919 } 1920 1921 bool ram_write_tracking_available(void) 1922 { 1923 return false; 1924 } 1925 1926 bool ram_write_tracking_compatible(void) 1927 { 1928 assert(0); 1929 return false; 1930 } 1931 1932 int ram_write_tracking_start(void) 1933 { 1934 assert(0); 1935 return -1; 1936 } 1937 1938 void ram_write_tracking_stop(void) 1939 { 1940 assert(0); 1941 } 1942 #endif /* defined(__linux__) */ 1943 1944 /** 1945 * get_queued_page: unqueue a page from the postcopy requests 1946 * 1947 * Skips pages that are already sent (!dirty) 1948 * 1949 * Returns true if a queued page is found 1950 * 1951 * @rs: current RAM state 1952 * @pss: data about the state of the current dirty page scan 1953 */ 1954 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1955 { 1956 RAMBlock *block; 1957 ram_addr_t offset; 1958 1959 block = unqueue_page(rs, &offset); 1960 1961 if (!block) { 1962 /* 1963 * Poll write faults too if background snapshot is enabled; that's 1964 * when we have vcpus got blocked by the write protected pages. 1965 */ 1966 block = poll_fault_page(rs, &offset); 1967 } 1968 1969 if (block) { 1970 /* 1971 * We want the background search to continue from the queued page 1972 * since the guest is likely to want other pages near to the page 1973 * it just requested. 1974 */ 1975 pss->block = block; 1976 pss->page = offset >> TARGET_PAGE_BITS; 1977 1978 /* 1979 * This unqueued page would break the "one round" check, even is 1980 * really rare. 1981 */ 1982 pss->complete_round = false; 1983 } 1984 1985 return !!block; 1986 } 1987 1988 /** 1989 * migration_page_queue_free: drop any remaining pages in the ram 1990 * request queue 1991 * 1992 * It should be empty at the end anyway, but in error cases there may 1993 * be some left. in case that there is any page left, we drop it. 1994 * 1995 */ 1996 static void migration_page_queue_free(RAMState *rs) 1997 { 1998 struct RAMSrcPageRequest *mspr, *next_mspr; 1999 /* This queue generally should be empty - but in the case of a failed 2000 * migration might have some droppings in. 
2001 */ 2002 RCU_READ_LOCK_GUARD(); 2003 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2004 memory_region_unref(mspr->rb->mr); 2005 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2006 g_free(mspr); 2007 } 2008 } 2009 2010 /** 2011 * ram_save_queue_pages: queue the page for transmission 2012 * 2013 * A request from postcopy destination for example. 2014 * 2015 * Returns zero on success or negative on error 2016 * 2017 * @rbname: Name of the RAMBLock of the request. NULL means the 2018 * same that last one. 2019 * @start: starting address from the start of the RAMBlock 2020 * @len: length (in bytes) to send 2021 */ 2022 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2023 { 2024 RAMBlock *ramblock; 2025 RAMState *rs = ram_state; 2026 2027 ram_counters.postcopy_requests++; 2028 RCU_READ_LOCK_GUARD(); 2029 2030 if (!rbname) { 2031 /* Reuse last RAMBlock */ 2032 ramblock = rs->last_req_rb; 2033 2034 if (!ramblock) { 2035 /* 2036 * Shouldn't happen, we can't reuse the last RAMBlock if 2037 * it's the 1st request. 2038 */ 2039 error_report("ram_save_queue_pages no previous block"); 2040 return -1; 2041 } 2042 } else { 2043 ramblock = qemu_ram_block_by_name(rbname); 2044 2045 if (!ramblock) { 2046 /* We shouldn't be asked for a non-existent RAMBlock */ 2047 error_report("ram_save_queue_pages no block '%s'", rbname); 2048 return -1; 2049 } 2050 rs->last_req_rb = ramblock; 2051 } 2052 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2053 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2054 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2055 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2056 __func__, start, len, ramblock->used_length); 2057 return -1; 2058 } 2059 2060 struct RAMSrcPageRequest *new_entry = 2061 g_malloc0(sizeof(struct RAMSrcPageRequest)); 2062 new_entry->rb = ramblock; 2063 new_entry->offset = start; 2064 new_entry->len = len; 2065 2066 memory_region_ref(ramblock->mr); 2067 qemu_mutex_lock(&rs->src_page_req_mutex); 2068 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2069 migration_make_urgent_request(); 2070 qemu_mutex_unlock(&rs->src_page_req_mutex); 2071 2072 return 0; 2073 } 2074 2075 static bool save_page_use_compression(RAMState *rs) 2076 { 2077 if (!migrate_use_compression()) { 2078 return false; 2079 } 2080 2081 /* 2082 * If xbzrle is enabled (e.g., after first round of migration), stop 2083 * using the data compression. In theory, xbzrle can do better than 2084 * compression. 2085 */ 2086 if (rs->xbzrle_enabled) { 2087 return false; 2088 } 2089 2090 return true; 2091 } 2092 2093 /* 2094 * try to compress the page before posting it out, return true if the page 2095 * has been properly handled by compression, otherwise needs other 2096 * paths to handle it 2097 */ 2098 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2099 { 2100 if (!save_page_use_compression(rs)) { 2101 return false; 2102 } 2103 2104 /* 2105 * When starting the process of a new block, the first page of 2106 * the block should be sent out before other pages in the same 2107 * block, and all the pages in last block should have been sent 2108 * out, keeping this order is important, because the 'cont' flag 2109 * is used to avoid resending the block name. 2110 * 2111 * We post the fist page as normal page as compression will take 2112 * much CPU resource. 
2113 */ 2114 if (block != rs->last_sent_block) { 2115 flush_compressed_data(rs); 2116 return false; 2117 } 2118 2119 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2120 return true; 2121 } 2122 2123 compression_counters.busy++; 2124 return false; 2125 } 2126 2127 /** 2128 * ram_save_target_page: save one target page 2129 * 2130 * Returns the number of pages written 2131 * 2132 * @rs: current RAM state 2133 * @pss: data about the page we want to send 2134 */ 2135 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2136 { 2137 RAMBlock *block = pss->block; 2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2139 int res; 2140 2141 if (control_save_page(rs, block, offset, &res)) { 2142 return res; 2143 } 2144 2145 if (save_compress_page(rs, block, offset)) { 2146 return 1; 2147 } 2148 2149 res = save_zero_page(rs, block, offset); 2150 if (res > 0) { 2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2152 * page would be stale 2153 */ 2154 if (!save_page_use_compression(rs)) { 2155 XBZRLE_cache_lock(); 2156 xbzrle_cache_zero_page(rs, block->offset + offset); 2157 XBZRLE_cache_unlock(); 2158 } 2159 return res; 2160 } 2161 2162 /* 2163 * Do not use multifd for: 2164 * 1. Compression as the first page in the new block should be posted out 2165 * before sending the compressed page 2166 * 2. In postcopy as one whole host page should be placed 2167 */ 2168 if (!save_page_use_compression(rs) && migrate_use_multifd() 2169 && !migration_in_postcopy()) { 2170 return ram_save_multifd_page(rs, block, offset); 2171 } 2172 2173 return ram_save_page(rs, pss); 2174 } 2175 2176 /** 2177 * ram_save_host_page: save a whole host page 2178 * 2179 * Starting at *offset send pages up to the end of the current host 2180 * page. It's valid for the initial offset to point into the middle of 2181 * a host page in which case the remainder of the hostpage is sent. 2182 * Only dirty target pages are sent. Note that the host page size may 2183 * be a huge page for this block. 2184 * The saving stops at the boundary of the used_length of the block 2185 * if the RAMBlock isn't a multiple of the host page size. 2186 * 2187 * Returns the number of pages written or negative on error 2188 * 2189 * @rs: current RAM state 2190 * @pss: data about the page we want to send 2191 */ 2192 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2193 { 2194 int tmppages, pages = 0; 2195 size_t pagesize_bits = 2196 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2197 unsigned long hostpage_boundary = 2198 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2199 unsigned long start_page = pss->page; 2200 int res; 2201 2202 if (ramblock_is_ignored(pss->block)) { 2203 error_report("block %s should not be migrated !", pss->block->idstr); 2204 return 0; 2205 } 2206 2207 do { 2208 /* Check the pages is dirty and if it is send it */ 2209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2210 tmppages = ram_save_target_page(rs, pss); 2211 if (tmppages < 0) { 2212 return tmppages; 2213 } 2214 2215 pages += tmppages; 2216 /* 2217 * Allow rate limiting to happen in the middle of huge pages if 2218 * something is sent in the current iteration. 
2219 */ 2220 if (pagesize_bits > 1 && tmppages > 0) { 2221 migration_rate_limit(); 2222 } 2223 } 2224 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2225 } while ((pss->page < hostpage_boundary) && 2226 offset_in_ramblock(pss->block, 2227 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2228 /* The offset we leave with is the min boundary of host page and block */ 2229 pss->page = MIN(pss->page, hostpage_boundary); 2230 2231 res = ram_save_release_protection(rs, pss, start_page); 2232 return (res < 0 ? res : pages); 2233 } 2234 2235 /** 2236 * ram_find_and_save_block: finds a dirty page and sends it to f 2237 * 2238 * Called within an RCU critical section. 2239 * 2240 * Returns the number of pages written where zero means no dirty pages, 2241 * or negative on error 2242 * 2243 * @rs: current RAM state 2244 * 2245 * On systems where host-page-size > target-page-size it will send all the 2246 * pages in a host page that are dirty. 2247 */ 2248 static int ram_find_and_save_block(RAMState *rs) 2249 { 2250 PageSearchStatus pss; 2251 int pages = 0; 2252 bool again, found; 2253 2254 /* No dirty page as there is zero RAM */ 2255 if (!ram_bytes_total()) { 2256 return pages; 2257 } 2258 2259 pss.block = rs->last_seen_block; 2260 pss.page = rs->last_page; 2261 pss.complete_round = false; 2262 2263 if (!pss.block) { 2264 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2265 } 2266 2267 do { 2268 again = true; 2269 found = get_queued_page(rs, &pss); 2270 2271 if (!found) { 2272 /* priority queue empty, so just search for something dirty */ 2273 found = find_dirty_block(rs, &pss, &again); 2274 } 2275 2276 if (found) { 2277 pages = ram_save_host_page(rs, &pss); 2278 } 2279 } while (!pages && again); 2280 2281 rs->last_seen_block = pss.block; 2282 rs->last_page = pss.page; 2283 2284 return pages; 2285 } 2286 2287 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2288 { 2289 uint64_t pages = size / TARGET_PAGE_SIZE; 2290 2291 if (zero) { 2292 ram_counters.duplicate += pages; 2293 } else { 2294 ram_counters.normal += pages; 2295 ram_transferred_add(size); 2296 qemu_update_position(f, size); 2297 } 2298 } 2299 2300 static uint64_t ram_bytes_total_common(bool count_ignored) 2301 { 2302 RAMBlock *block; 2303 uint64_t total = 0; 2304 2305 RCU_READ_LOCK_GUARD(); 2306 2307 if (count_ignored) { 2308 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2309 total += block->used_length; 2310 } 2311 } else { 2312 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2313 total += block->used_length; 2314 } 2315 } 2316 return total; 2317 } 2318 2319 uint64_t ram_bytes_total(void) 2320 { 2321 return ram_bytes_total_common(false); 2322 } 2323 2324 static void xbzrle_load_setup(void) 2325 { 2326 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2327 } 2328 2329 static void xbzrle_load_cleanup(void) 2330 { 2331 g_free(XBZRLE.decoded_buf); 2332 XBZRLE.decoded_buf = NULL; 2333 } 2334 2335 static void ram_state_cleanup(RAMState **rsp) 2336 { 2337 if (*rsp) { 2338 migration_page_queue_free(*rsp); 2339 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2340 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2341 g_free(*rsp); 2342 *rsp = NULL; 2343 } 2344 } 2345 2346 static void xbzrle_cleanup(void) 2347 { 2348 XBZRLE_cache_lock(); 2349 if (XBZRLE.cache) { 2350 cache_fini(XBZRLE.cache); 2351 g_free(XBZRLE.encoded_buf); 2352 g_free(XBZRLE.current_buf); 2353 g_free(XBZRLE.zero_target_page); 2354 XBZRLE.cache = NULL; 2355 XBZRLE.encoded_buf = NULL; 2356 XBZRLE.current_buf = NULL; 2357 XBZRLE.zero_target_page = NULL; 2358 } 2359 
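    /*
     * The buffers released above back the XBZRLE delta encoder. A rough,
     * hedged sketch of the round trip they support, assuming the xbzrle.h
     * prototypes (the helper name and stack buffer are illustrative only):
     *
     *   static bool example_xbzrle_roundtrip(uint8_t *cached, uint8_t *current)
     *   {
     *       uint8_t encoded[TARGET_PAGE_SIZE];
     *       int elen;
     *
     *       elen = xbzrle_encode_buffer(cached, current, TARGET_PAGE_SIZE,
     *                                   encoded, sizeof(encoded));
     *       if (elen <= 0) {
     *           return false;    // unchanged page, or delta did not fit
     *       }
     *       // Applying the delta to the cached copy recreates the new page
     *       return xbzrle_decode_buffer(encoded, elen, cached,
     *                                   TARGET_PAGE_SIZE) >= 0;
     *   }
     */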
XBZRLE_cache_unlock(); 2360 } 2361 2362 static void ram_save_cleanup(void *opaque) 2363 { 2364 RAMState **rsp = opaque; 2365 RAMBlock *block; 2366 2367 /* We don't use dirty log with background snapshots */ 2368 if (!migrate_background_snapshot()) { 2369 /* caller have hold iothread lock or is in a bh, so there is 2370 * no writing race against the migration bitmap 2371 */ 2372 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2373 /* 2374 * do not stop dirty log without starting it, since 2375 * memory_global_dirty_log_stop will assert that 2376 * memory_global_dirty_log_start/stop used in pairs 2377 */ 2378 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2379 } 2380 } 2381 2382 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2383 g_free(block->clear_bmap); 2384 block->clear_bmap = NULL; 2385 g_free(block->bmap); 2386 block->bmap = NULL; 2387 } 2388 2389 xbzrle_cleanup(); 2390 compress_threads_save_cleanup(); 2391 ram_state_cleanup(rsp); 2392 } 2393 2394 static void ram_state_reset(RAMState *rs) 2395 { 2396 rs->last_seen_block = NULL; 2397 rs->last_sent_block = NULL; 2398 rs->last_page = 0; 2399 rs->last_version = ram_list.version; 2400 rs->xbzrle_enabled = false; 2401 } 2402 2403 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2404 2405 /* **** functions for postcopy ***** */ 2406 2407 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2408 { 2409 struct RAMBlock *block; 2410 2411 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2412 unsigned long *bitmap = block->bmap; 2413 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2414 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2415 2416 while (run_start < range) { 2417 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2418 ram_discard_range(block->idstr, 2419 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2420 ((ram_addr_t)(run_end - run_start)) 2421 << TARGET_PAGE_BITS); 2422 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2423 } 2424 } 2425 } 2426 2427 /** 2428 * postcopy_send_discard_bm_ram: discard a RAMBlock 2429 * 2430 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2431 * 2432 * @ms: current migration state 2433 * @block: RAMBlock to discard 2434 */ 2435 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2436 { 2437 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2438 unsigned long current; 2439 unsigned long *bitmap = block->bmap; 2440 2441 for (current = 0; current < end; ) { 2442 unsigned long one = find_next_bit(bitmap, end, current); 2443 unsigned long zero, discard_length; 2444 2445 if (one >= end) { 2446 break; 2447 } 2448 2449 zero = find_next_zero_bit(bitmap, end, one + 1); 2450 2451 if (zero >= end) { 2452 discard_length = end - one; 2453 } else { 2454 discard_length = zero - one; 2455 } 2456 postcopy_discard_send_range(ms, one, discard_length); 2457 current = one + discard_length; 2458 } 2459 } 2460 2461 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2462 2463 /** 2464 * postcopy_each_ram_send_discard: discard all RAMBlocks 2465 * 2466 * Utility for the outgoing postcopy code. 2467 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2468 * passing it bitmap indexes and name. 
2469 * (qemu_ram_foreach_block ends up passing unscaled lengths 2470 * which would mean postcopy code would have to deal with target page) 2471 * 2472 * @ms: current migration state 2473 */ 2474 static void postcopy_each_ram_send_discard(MigrationState *ms) 2475 { 2476 struct RAMBlock *block; 2477 2478 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2479 postcopy_discard_send_init(ms, block->idstr); 2480 2481 /* 2482 * Deal with TPS != HPS and huge pages. It discard any partially sent 2483 * host-page size chunks, mark any partially dirty host-page size 2484 * chunks as all dirty. In this case the host-page is the host-page 2485 * for the particular RAMBlock, i.e. it might be a huge page. 2486 */ 2487 postcopy_chunk_hostpages_pass(ms, block); 2488 2489 /* 2490 * Postcopy sends chunks of bitmap over the wire, but it 2491 * just needs indexes at this point, avoids it having 2492 * target page specific code. 2493 */ 2494 postcopy_send_discard_bm_ram(ms, block); 2495 postcopy_discard_send_finish(ms); 2496 } 2497 } 2498 2499 /** 2500 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2501 * 2502 * Helper for postcopy_chunk_hostpages; it's called twice to 2503 * canonicalize the two bitmaps, that are similar, but one is 2504 * inverted. 2505 * 2506 * Postcopy requires that all target pages in a hostpage are dirty or 2507 * clean, not a mix. This function canonicalizes the bitmaps. 2508 * 2509 * @ms: current migration state 2510 * @block: block that contains the page we want to canonicalize 2511 */ 2512 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2513 { 2514 RAMState *rs = ram_state; 2515 unsigned long *bitmap = block->bmap; 2516 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2517 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2518 unsigned long run_start; 2519 2520 if (block->page_size == TARGET_PAGE_SIZE) { 2521 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2522 return; 2523 } 2524 2525 /* Find a dirty page */ 2526 run_start = find_next_bit(bitmap, pages, 0); 2527 2528 while (run_start < pages) { 2529 2530 /* 2531 * If the start of this run of pages is in the middle of a host 2532 * page, then we need to fixup this host page. 2533 */ 2534 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2535 /* Find the end of this run */ 2536 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2537 /* 2538 * If the end isn't at the start of a host page, then the 2539 * run doesn't finish at the end of a host page 2540 * and we need to discard. 2541 */ 2542 } 2543 2544 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2545 unsigned long page; 2546 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2547 host_ratio); 2548 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2549 2550 /* Clean up the bitmap */ 2551 for (page = fixup_start_addr; 2552 page < fixup_start_addr + host_ratio; page++) { 2553 /* 2554 * Remark them as dirty, updating the count for any pages 2555 * that weren't previously dirty. 
2556 */ 2557 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2558 } 2559 } 2560 2561 /* Find the next dirty page for the next iteration */ 2562 run_start = find_next_bit(bitmap, pages, run_start); 2563 } 2564 } 2565 2566 /** 2567 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2568 * 2569 * Transmit the set of pages to be discarded after precopy to the target 2570 * these are pages that: 2571 * a) Have been previously transmitted but are now dirty again 2572 * b) Pages that have never been transmitted, this ensures that 2573 * any pages on the destination that have been mapped by background 2574 * tasks get discarded (transparent huge pages is the specific concern) 2575 * Hopefully this is pretty sparse 2576 * 2577 * @ms: current migration state 2578 */ 2579 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2580 { 2581 RAMState *rs = ram_state; 2582 2583 RCU_READ_LOCK_GUARD(); 2584 2585 /* This should be our last sync, the src is now paused */ 2586 migration_bitmap_sync(rs); 2587 2588 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2589 rs->last_seen_block = NULL; 2590 rs->last_sent_block = NULL; 2591 rs->last_page = 0; 2592 2593 postcopy_each_ram_send_discard(ms); 2594 2595 trace_ram_postcopy_send_discard_bitmap(); 2596 } 2597 2598 /** 2599 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2600 * 2601 * Returns zero on success 2602 * 2603 * @rbname: name of the RAMBlock of the request. NULL means the 2604 * same that last one. 2605 * @start: RAMBlock starting page 2606 * @length: RAMBlock size 2607 */ 2608 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2609 { 2610 trace_ram_discard_range(rbname, start, length); 2611 2612 RCU_READ_LOCK_GUARD(); 2613 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2614 2615 if (!rb) { 2616 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2617 return -1; 2618 } 2619 2620 /* 2621 * On source VM, we don't need to update the received bitmap since 2622 * we don't even have one. 2623 */ 2624 if (rb->receivedmap) { 2625 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2626 length >> qemu_target_page_bits()); 2627 } 2628 2629 return ram_block_discard_range(rb, start, length); 2630 } 2631 2632 /* 2633 * For every allocation, we will try not to crash the VM if the 2634 * allocation failed. 
2635 */ 2636 static int xbzrle_init(void) 2637 { 2638 Error *local_err = NULL; 2639 2640 if (!migrate_use_xbzrle()) { 2641 return 0; 2642 } 2643 2644 XBZRLE_cache_lock(); 2645 2646 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2647 if (!XBZRLE.zero_target_page) { 2648 error_report("%s: Error allocating zero page", __func__); 2649 goto err_out; 2650 } 2651 2652 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2653 TARGET_PAGE_SIZE, &local_err); 2654 if (!XBZRLE.cache) { 2655 error_report_err(local_err); 2656 goto free_zero_page; 2657 } 2658 2659 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2660 if (!XBZRLE.encoded_buf) { 2661 error_report("%s: Error allocating encoded_buf", __func__); 2662 goto free_cache; 2663 } 2664 2665 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2666 if (!XBZRLE.current_buf) { 2667 error_report("%s: Error allocating current_buf", __func__); 2668 goto free_encoded_buf; 2669 } 2670 2671 /* We are all good */ 2672 XBZRLE_cache_unlock(); 2673 return 0; 2674 2675 free_encoded_buf: 2676 g_free(XBZRLE.encoded_buf); 2677 XBZRLE.encoded_buf = NULL; 2678 free_cache: 2679 cache_fini(XBZRLE.cache); 2680 XBZRLE.cache = NULL; 2681 free_zero_page: 2682 g_free(XBZRLE.zero_target_page); 2683 XBZRLE.zero_target_page = NULL; 2684 err_out: 2685 XBZRLE_cache_unlock(); 2686 return -ENOMEM; 2687 } 2688 2689 static int ram_state_init(RAMState **rsp) 2690 { 2691 *rsp = g_try_new0(RAMState, 1); 2692 2693 if (!*rsp) { 2694 error_report("%s: Init ramstate fail", __func__); 2695 return -1; 2696 } 2697 2698 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2699 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2700 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2701 2702 /* 2703 * Count the total number of pages used by ram blocks not including any 2704 * gaps due to alignment or unplugs. 2705 * This must match with the initial values of dirty bitmap. 2706 */ 2707 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2708 ram_state_reset(*rsp); 2709 2710 return 0; 2711 } 2712 2713 static void ram_list_init_bitmaps(void) 2714 { 2715 MigrationState *ms = migrate_get_current(); 2716 RAMBlock *block; 2717 unsigned long pages; 2718 uint8_t shift; 2719 2720 /* Skip setting bitmap if there is no RAM */ 2721 if (ram_bytes_total()) { 2722 shift = ms->clear_bitmap_shift; 2723 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2724 error_report("clear_bitmap_shift (%u) too big, using " 2725 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2726 shift = CLEAR_BITMAP_SHIFT_MAX; 2727 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2728 error_report("clear_bitmap_shift (%u) too small, using " 2729 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2730 shift = CLEAR_BITMAP_SHIFT_MIN; 2731 } 2732 2733 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2734 pages = block->max_length >> TARGET_PAGE_BITS; 2735 /* 2736 * The initial dirty bitmap for migration must be set with all 2737 * ones to make sure we'll migrate every guest RAM page to 2738 * destination. 2739 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2740 * new migration after a failed migration, ram_list. 2741 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2742 * guest memory. 
2743 */ 2744 block->bmap = bitmap_new(pages); 2745 bitmap_set(block->bmap, 0, pages); 2746 block->clear_bmap_shift = shift; 2747 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2748 } 2749 } 2750 } 2751 2752 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2753 { 2754 unsigned long pages; 2755 RAMBlock *rb; 2756 2757 RCU_READ_LOCK_GUARD(); 2758 2759 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2760 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2761 rs->migration_dirty_pages -= pages; 2762 } 2763 } 2764 2765 static void ram_init_bitmaps(RAMState *rs) 2766 { 2767 /* For memory_global_dirty_log_start below. */ 2768 qemu_mutex_lock_iothread(); 2769 qemu_mutex_lock_ramlist(); 2770 2771 WITH_RCU_READ_LOCK_GUARD() { 2772 ram_list_init_bitmaps(); 2773 /* We don't use dirty log with background snapshots */ 2774 if (!migrate_background_snapshot()) { 2775 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2776 migration_bitmap_sync_precopy(rs); 2777 } 2778 } 2779 qemu_mutex_unlock_ramlist(); 2780 qemu_mutex_unlock_iothread(); 2781 2782 /* 2783 * After an eventual first bitmap sync, fixup the initial bitmap 2784 * containing all 1s to exclude any discarded pages from migration. 2785 */ 2786 migration_bitmap_clear_discarded_pages(rs); 2787 } 2788 2789 static int ram_init_all(RAMState **rsp) 2790 { 2791 if (ram_state_init(rsp)) { 2792 return -1; 2793 } 2794 2795 if (xbzrle_init()) { 2796 ram_state_cleanup(rsp); 2797 return -1; 2798 } 2799 2800 ram_init_bitmaps(*rsp); 2801 2802 return 0; 2803 } 2804 2805 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2806 { 2807 RAMBlock *block; 2808 uint64_t pages = 0; 2809 2810 /* 2811 * Postcopy is not using xbzrle/compression, so no need for that. 2812 * Also, since source are already halted, we don't need to care 2813 * about dirty page logging as well. 2814 */ 2815 2816 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2817 pages += bitmap_count_one(block->bmap, 2818 block->used_length >> TARGET_PAGE_BITS); 2819 } 2820 2821 /* This may not be aligned with current bitmaps. Recalculate. */ 2822 rs->migration_dirty_pages = pages; 2823 2824 ram_state_reset(rs); 2825 2826 /* Update RAMState cache of output QEMUFile */ 2827 rs->f = out; 2828 2829 trace_ram_state_resume_prepare(pages); 2830 } 2831 2832 /* 2833 * This function clears bits of the free pages reported by the caller from the 2834 * migration dirty bitmap. @addr is the host address corresponding to the 2835 * start of the continuous guest free pages, and @len is the total bytes of 2836 * those pages. 2837 */ 2838 void qemu_guest_free_page_hint(void *addr, size_t len) 2839 { 2840 RAMBlock *block; 2841 ram_addr_t offset; 2842 size_t used_len, start, npages; 2843 MigrationState *s = migrate_get_current(); 2844 2845 /* This function is currently expected to be used during live migration */ 2846 if (!migration_is_setup_or_active(s->state)) { 2847 return; 2848 } 2849 2850 for (; len > 0; len -= used_len, addr += used_len) { 2851 block = qemu_ram_block_from_host(addr, false, &offset); 2852 if (unlikely(!block || offset >= block->used_length)) { 2853 /* 2854 * The implementation might not support RAMBlock resize during 2855 * live migration, but it could happen in theory with future 2856 * updates. So we add a check here to capture that case. 
2857 */ 2858 error_report_once("%s unexpected error", __func__); 2859 return; 2860 } 2861 2862 if (len <= block->used_length - offset) { 2863 used_len = len; 2864 } else { 2865 used_len = block->used_length - offset; 2866 } 2867 2868 start = offset >> TARGET_PAGE_BITS; 2869 npages = used_len >> TARGET_PAGE_BITS; 2870 2871 qemu_mutex_lock(&ram_state->bitmap_mutex); 2872 /* 2873 * The skipped free pages are equavalent to be sent from clear_bmap's 2874 * perspective, so clear the bits from the memory region bitmap which 2875 * are initially set. Otherwise those skipped pages will be sent in 2876 * the next round after syncing from the memory region bitmap. 2877 */ 2878 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2879 ram_state->migration_dirty_pages -= 2880 bitmap_count_one_with_offset(block->bmap, start, npages); 2881 bitmap_clear(block->bmap, start, npages); 2882 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2883 } 2884 } 2885 2886 /* 2887 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2888 * long-running RCU critical section. When rcu-reclaims in the code 2889 * start to become numerous it will be necessary to reduce the 2890 * granularity of these critical sections. 2891 */ 2892 2893 /** 2894 * ram_save_setup: Setup RAM for migration 2895 * 2896 * Returns zero to indicate success and negative for error 2897 * 2898 * @f: QEMUFile where to send the data 2899 * @opaque: RAMState pointer 2900 */ 2901 static int ram_save_setup(QEMUFile *f, void *opaque) 2902 { 2903 RAMState **rsp = opaque; 2904 RAMBlock *block; 2905 2906 if (compress_threads_save_setup()) { 2907 return -1; 2908 } 2909 2910 /* migration has already setup the bitmap, reuse it. */ 2911 if (!migration_in_colo_state()) { 2912 if (ram_init_all(rsp) != 0) { 2913 compress_threads_save_cleanup(); 2914 return -1; 2915 } 2916 } 2917 (*rsp)->f = f; 2918 2919 WITH_RCU_READ_LOCK_GUARD() { 2920 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2921 2922 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2923 qemu_put_byte(f, strlen(block->idstr)); 2924 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2925 qemu_put_be64(f, block->used_length); 2926 if (migrate_postcopy_ram() && block->page_size != 2927 qemu_host_page_size) { 2928 qemu_put_be64(f, block->page_size); 2929 } 2930 if (migrate_ignore_shared()) { 2931 qemu_put_be64(f, block->mr->addr); 2932 } 2933 } 2934 } 2935 2936 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2937 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2938 2939 multifd_send_sync_main(f); 2940 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2941 qemu_fflush(f); 2942 2943 return 0; 2944 } 2945 2946 /** 2947 * ram_save_iterate: iterative stage for migration 2948 * 2949 * Returns zero to indicate success and negative for error 2950 * 2951 * @f: QEMUFile where to send the data 2952 * @opaque: RAMState pointer 2953 */ 2954 static int ram_save_iterate(QEMUFile *f, void *opaque) 2955 { 2956 RAMState **temp = opaque; 2957 RAMState *rs = *temp; 2958 int ret = 0; 2959 int i; 2960 int64_t t0; 2961 int done = 0; 2962 2963 if (blk_mig_bulk_active()) { 2964 /* Avoid transferring ram during bulk phase of block migration as 2965 * the bulk phase will usually take a long time and transferring 2966 * ram updates during that time is pointless. */ 2967 goto out; 2968 } 2969 2970 /* 2971 * We'll take this lock a little bit long, but it's okay for two reasons. 
     * Firstly, the only other thread that can take it is the one calling
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check on the first iteration, just in case it was
             * the first time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * few iterations.
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
3042 */ 3043 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3044 3045 out: 3046 if (ret >= 0 3047 && migration_is_setup_or_active(migrate_get_current()->state)) { 3048 multifd_send_sync_main(rs->f); 3049 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3050 qemu_fflush(f); 3051 ram_transferred_add(8); 3052 3053 ret = qemu_file_get_error(f); 3054 } 3055 if (ret < 0) { 3056 return ret; 3057 } 3058 3059 return done; 3060 } 3061 3062 /** 3063 * ram_save_complete: function called to send the remaining amount of ram 3064 * 3065 * Returns zero to indicate success or negative on error 3066 * 3067 * Called with iothread lock 3068 * 3069 * @f: QEMUFile where to send the data 3070 * @opaque: RAMState pointer 3071 */ 3072 static int ram_save_complete(QEMUFile *f, void *opaque) 3073 { 3074 RAMState **temp = opaque; 3075 RAMState *rs = *temp; 3076 int ret = 0; 3077 3078 rs->last_stage = !migration_in_colo_state(); 3079 3080 WITH_RCU_READ_LOCK_GUARD() { 3081 if (!migration_in_postcopy()) { 3082 migration_bitmap_sync_precopy(rs); 3083 } 3084 3085 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3086 3087 /* try transferring iterative blocks of memory */ 3088 3089 /* flush all remaining blocks regardless of rate limiting */ 3090 while (true) { 3091 int pages; 3092 3093 pages = ram_find_and_save_block(rs); 3094 /* no more blocks to sent */ 3095 if (pages == 0) { 3096 break; 3097 } 3098 if (pages < 0) { 3099 ret = pages; 3100 break; 3101 } 3102 } 3103 3104 flush_compressed_data(rs); 3105 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3106 } 3107 3108 if (ret >= 0) { 3109 multifd_send_sync_main(rs->f); 3110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3111 qemu_fflush(f); 3112 } 3113 3114 return ret; 3115 } 3116 3117 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3118 uint64_t *res_precopy_only, 3119 uint64_t *res_compatible, 3120 uint64_t *res_postcopy_only) 3121 { 3122 RAMState **temp = opaque; 3123 RAMState *rs = *temp; 3124 uint64_t remaining_size; 3125 3126 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3127 3128 if (!migration_in_postcopy() && 3129 remaining_size < max_size) { 3130 qemu_mutex_lock_iothread(); 3131 WITH_RCU_READ_LOCK_GUARD() { 3132 migration_bitmap_sync_precopy(rs); 3133 } 3134 qemu_mutex_unlock_iothread(); 3135 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3136 } 3137 3138 if (migrate_postcopy_ram()) { 3139 /* We can do postcopy, and all the data is postcopiable */ 3140 *res_compatible += remaining_size; 3141 } else { 3142 *res_precopy_only += remaining_size; 3143 } 3144 } 3145 3146 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3147 { 3148 unsigned int xh_len; 3149 int xh_flags; 3150 uint8_t *loaded_data; 3151 3152 /* extract RLE header */ 3153 xh_flags = qemu_get_byte(f); 3154 xh_len = qemu_get_be16(f); 3155 3156 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3157 error_report("Failed to load XBZRLE page - wrong compression!"); 3158 return -1; 3159 } 3160 3161 if (xh_len > TARGET_PAGE_SIZE) { 3162 error_report("Failed to load XBZRLE page - len overflow!"); 3163 return -1; 3164 } 3165 loaded_data = XBZRLE.decoded_buf; 3166 /* load data and decode */ 3167 /* it can change loaded_data to point to an internal buffer */ 3168 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3169 3170 /* decode RLE */ 3171 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3172 TARGET_PAGE_SIZE) == -1) { 3173 error_report("Failed to load XBZRLE page - decode error!"); 3174 return -1; 3175 } 3176 3177 return 0; 3178 } 3179 3180 /** 3181 * 
ram_block_from_stream: read a RAMBlock id from the migration stream 3182 * 3183 * Must be called from within a rcu critical section. 3184 * 3185 * Returns a pointer from within the RCU-protected ram_list. 3186 * 3187 * @f: QEMUFile where to read the data from 3188 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3189 */ 3190 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3191 { 3192 static RAMBlock *block; 3193 char id[256]; 3194 uint8_t len; 3195 3196 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3197 if (!block) { 3198 error_report("Ack, bad migration stream!"); 3199 return NULL; 3200 } 3201 return block; 3202 } 3203 3204 len = qemu_get_byte(f); 3205 qemu_get_buffer(f, (uint8_t *)id, len); 3206 id[len] = 0; 3207 3208 block = qemu_ram_block_by_name(id); 3209 if (!block) { 3210 error_report("Can't find block %s", id); 3211 return NULL; 3212 } 3213 3214 if (ramblock_is_ignored(block)) { 3215 error_report("block %s should not be migrated !", id); 3216 return NULL; 3217 } 3218 3219 return block; 3220 } 3221 3222 static inline void *host_from_ram_block_offset(RAMBlock *block, 3223 ram_addr_t offset) 3224 { 3225 if (!offset_in_ramblock(block, offset)) { 3226 return NULL; 3227 } 3228 3229 return block->host + offset; 3230 } 3231 3232 static void *host_page_from_ram_block_offset(RAMBlock *block, 3233 ram_addr_t offset) 3234 { 3235 /* Note: Explicitly no check against offset_in_ramblock(). */ 3236 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3237 block->page_size); 3238 } 3239 3240 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3241 ram_addr_t offset) 3242 { 3243 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3244 } 3245 3246 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3247 ram_addr_t offset, bool record_bitmap) 3248 { 3249 if (!offset_in_ramblock(block, offset)) { 3250 return NULL; 3251 } 3252 if (!block->colo_cache) { 3253 error_report("%s: colo_cache is NULL in block :%s", 3254 __func__, block->idstr); 3255 return NULL; 3256 } 3257 3258 /* 3259 * During colo checkpoint, we need bitmap of these migrated pages. 3260 * It help us to decide which pages in ram cache should be flushed 3261 * into VM's RAM later. 3262 */ 3263 if (record_bitmap && 3264 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3265 ram_state->migration_dirty_pages++; 3266 } 3267 return block->colo_cache + offset; 3268 } 3269 3270 /** 3271 * ram_handle_compressed: handle the zero page case 3272 * 3273 * If a page (or a whole RDMA chunk) has been 3274 * determined to be zero, then zap it. 3275 * 3276 * @host: host address for the zero page 3277 * @ch: what the page is filled from. 
We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * We use this as an indicator of whether the thread was
         * properly initialized or not.
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;
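    /*
     * For orientation, the load-side decompression pool set up below is
     * driven in this order elsewhere in this file (a condensed sketch of
     * what ram_load_setup(), ram_load_precopy() and ram_load_cleanup() do;
     * error handling omitted):
     *
     *   if (compress_threads_load_setup(f)) {            // spawn the workers
     *       return -1;
     *   }
     *   ...
     *   len = qemu_get_be32(f);                          // one compressed page
     *   decompress_data_with_multi_threads(f, host, len);
     *   ...
     *   ret |= wait_for_decompress_done();               // drain the pool
     *   compress_threads_load_cleanup();                 // join and free workers
     */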
3414 3415 if (!migrate_use_compression()) { 3416 return 0; 3417 } 3418 3419 thread_count = migrate_decompress_threads(); 3420 decompress_threads = g_new0(QemuThread, thread_count); 3421 decomp_param = g_new0(DecompressParam, thread_count); 3422 qemu_mutex_init(&decomp_done_lock); 3423 qemu_cond_init(&decomp_done_cond); 3424 decomp_file = f; 3425 for (i = 0; i < thread_count; i++) { 3426 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3427 goto exit; 3428 } 3429 3430 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3431 qemu_mutex_init(&decomp_param[i].mutex); 3432 qemu_cond_init(&decomp_param[i].cond); 3433 decomp_param[i].done = true; 3434 decomp_param[i].quit = false; 3435 qemu_thread_create(decompress_threads + i, "decompress", 3436 do_data_decompress, decomp_param + i, 3437 QEMU_THREAD_JOINABLE); 3438 } 3439 return 0; 3440 exit: 3441 compress_threads_load_cleanup(); 3442 return -1; 3443 } 3444 3445 static void decompress_data_with_multi_threads(QEMUFile *f, 3446 void *host, int len) 3447 { 3448 int idx, thread_count; 3449 3450 thread_count = migrate_decompress_threads(); 3451 QEMU_LOCK_GUARD(&decomp_done_lock); 3452 while (true) { 3453 for (idx = 0; idx < thread_count; idx++) { 3454 if (decomp_param[idx].done) { 3455 decomp_param[idx].done = false; 3456 qemu_mutex_lock(&decomp_param[idx].mutex); 3457 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3458 decomp_param[idx].des = host; 3459 decomp_param[idx].len = len; 3460 qemu_cond_signal(&decomp_param[idx].cond); 3461 qemu_mutex_unlock(&decomp_param[idx].mutex); 3462 break; 3463 } 3464 } 3465 if (idx < thread_count) { 3466 break; 3467 } else { 3468 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3469 } 3470 } 3471 } 3472 3473 static void colo_init_ram_state(void) 3474 { 3475 ram_state_init(&ram_state); 3476 } 3477 3478 /* 3479 * colo cache: this is for secondary VM, we cache the whole 3480 * memory of the secondary VM, it is need to hold the global lock 3481 * to call this helper. 3482 */ 3483 int colo_init_ram_cache(void) 3484 { 3485 RAMBlock *block; 3486 3487 WITH_RCU_READ_LOCK_GUARD() { 3488 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3489 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3490 NULL, false, false); 3491 if (!block->colo_cache) { 3492 error_report("%s: Can't alloc memory for COLO cache of block %s," 3493 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3494 block->used_length); 3495 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3496 if (block->colo_cache) { 3497 qemu_anon_ram_free(block->colo_cache, block->used_length); 3498 block->colo_cache = NULL; 3499 } 3500 } 3501 return -errno; 3502 } 3503 if (!machine_dump_guest_core(current_machine)) { 3504 qemu_madvise(block->colo_cache, block->used_length, 3505 QEMU_MADV_DONTDUMP); 3506 } 3507 } 3508 } 3509 3510 /* 3511 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3512 * with to decide which page in cache should be flushed into SVM's RAM. Here 3513 * we use the same name 'ram_bitmap' as for migration. 3514 */ 3515 if (ram_bytes_total()) { 3516 RAMBlock *block; 3517 3518 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3519 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3520 block->bmap = bitmap_new(pages); 3521 } 3522 } 3523 3524 colo_init_ram_state(); 3525 return 0; 3526 } 3527 3528 /* TODO: duplicated with ram_init_bitmaps */ 3529 void colo_incoming_start_dirty_log(void) 3530 { 3531 RAMBlock *block = NULL; 3532 /* For memory_global_dirty_log_start below. 
*/ 3533 qemu_mutex_lock_iothread(); 3534 qemu_mutex_lock_ramlist(); 3535 3536 memory_global_dirty_log_sync(); 3537 WITH_RCU_READ_LOCK_GUARD() { 3538 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3539 ramblock_sync_dirty_bitmap(ram_state, block); 3540 /* Discard this dirty bitmap record */ 3541 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3542 } 3543 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3544 } 3545 ram_state->migration_dirty_pages = 0; 3546 qemu_mutex_unlock_ramlist(); 3547 qemu_mutex_unlock_iothread(); 3548 } 3549 3550 /* It is need to hold the global lock to call this helper */ 3551 void colo_release_ram_cache(void) 3552 { 3553 RAMBlock *block; 3554 3555 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3556 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3557 g_free(block->bmap); 3558 block->bmap = NULL; 3559 } 3560 3561 WITH_RCU_READ_LOCK_GUARD() { 3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3563 if (block->colo_cache) { 3564 qemu_anon_ram_free(block->colo_cache, block->used_length); 3565 block->colo_cache = NULL; 3566 } 3567 } 3568 } 3569 ram_state_cleanup(&ram_state); 3570 } 3571 3572 /** 3573 * ram_load_setup: Setup RAM for migration incoming side 3574 * 3575 * Returns zero to indicate success and negative for error 3576 * 3577 * @f: QEMUFile where to receive the data 3578 * @opaque: RAMState pointer 3579 */ 3580 static int ram_load_setup(QEMUFile *f, void *opaque) 3581 { 3582 if (compress_threads_load_setup(f)) { 3583 return -1; 3584 } 3585 3586 xbzrle_load_setup(); 3587 ramblock_recv_map_init(); 3588 3589 return 0; 3590 } 3591 3592 static int ram_load_cleanup(void *opaque) 3593 { 3594 RAMBlock *rb; 3595 3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3597 qemu_ram_block_writeback(rb); 3598 } 3599 3600 xbzrle_load_cleanup(); 3601 compress_threads_load_cleanup(); 3602 3603 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3604 g_free(rb->receivedmap); 3605 rb->receivedmap = NULL; 3606 } 3607 3608 return 0; 3609 } 3610 3611 /** 3612 * ram_postcopy_incoming_init: allocate postcopy data structures 3613 * 3614 * Returns 0 for success and negative if there was one error 3615 * 3616 * @mis: current migration incoming state 3617 * 3618 * Allocate data structures etc needed by incoming migration with 3619 * postcopy-ram. postcopy-ram's similarly names 3620 * postcopy_ram_incoming_init does the work. 3621 */ 3622 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3623 { 3624 return postcopy_ram_incoming_init(mis); 3625 } 3626 3627 /** 3628 * ram_load_postcopy: load a page in postcopy case 3629 * 3630 * Returns 0 for success or -errno in case of error 3631 * 3632 * Called in postcopy mode by ram_load(). 3633 * rcu_read_lock is taken prior to this being called. 
3634 * 3635 * @f: QEMUFile where to send the data 3636 */ 3637 static int ram_load_postcopy(QEMUFile *f) 3638 { 3639 int flags = 0, ret = 0; 3640 bool place_needed = false; 3641 bool matches_target_page_size = false; 3642 MigrationIncomingState *mis = migration_incoming_get_current(); 3643 /* Temporary page that is later 'placed' */ 3644 void *postcopy_host_page = mis->postcopy_tmp_page; 3645 void *host_page = NULL; 3646 bool all_zero = true; 3647 int target_pages = 0; 3648 3649 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3650 ram_addr_t addr; 3651 void *page_buffer = NULL; 3652 void *place_source = NULL; 3653 RAMBlock *block = NULL; 3654 uint8_t ch; 3655 int len; 3656 3657 addr = qemu_get_be64(f); 3658 3659 /* 3660 * If qemu file error, we should stop here, and then "addr" 3661 * may be invalid 3662 */ 3663 ret = qemu_file_get_error(f); 3664 if (ret) { 3665 break; 3666 } 3667 3668 flags = addr & ~TARGET_PAGE_MASK; 3669 addr &= TARGET_PAGE_MASK; 3670 3671 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3672 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3673 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3674 block = ram_block_from_stream(f, flags); 3675 if (!block) { 3676 ret = -EINVAL; 3677 break; 3678 } 3679 3680 /* 3681 * Relying on used_length is racy and can result in false positives. 3682 * We might place pages beyond used_length in case RAM was shrunk 3683 * while in postcopy, which is fine - trying to place via 3684 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3685 */ 3686 if (!block->host || addr >= block->postcopy_length) { 3687 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3688 ret = -EINVAL; 3689 break; 3690 } 3691 target_pages++; 3692 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3693 /* 3694 * Postcopy requires that we place whole host pages atomically; 3695 * these may be huge pages for RAMBlocks that are backed by 3696 * hugetlbfs. 3697 * To make it atomic, the data is read into a temporary page 3698 * that's moved into place later. 3699 * The migration protocol uses, possibly smaller, target-pages 3700 * however the source ensures it always sends all the components 3701 * of a host page in one chunk. 3702 */ 3703 page_buffer = postcopy_host_page + 3704 host_page_offset_from_ram_block_offset(block, addr); 3705 /* If all TP are zero then we can optimise the place */ 3706 if (target_pages == 1) { 3707 host_page = host_page_from_ram_block_offset(block, addr); 3708 } else if (host_page != host_page_from_ram_block_offset(block, 3709 addr)) { 3710 /* not the 1st TP within the HP */ 3711 error_report("Non-same host page %p/%p", host_page, 3712 host_page_from_ram_block_offset(block, addr)); 3713 ret = -EINVAL; 3714 break; 3715 } 3716 3717 /* 3718 * If it's the last part of a host page then we place the host 3719 * page 3720 */ 3721 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3722 place_needed = true; 3723 } 3724 place_source = postcopy_host_page; 3725 } 3726 3727 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3728 case RAM_SAVE_FLAG_ZERO: 3729 ch = qemu_get_byte(f); 3730 /* 3731 * Can skip to set page_buffer when 3732 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 
3733 */ 3734 if (ch || !matches_target_page_size) { 3735 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3736 } 3737 if (ch) { 3738 all_zero = false; 3739 } 3740 break; 3741 3742 case RAM_SAVE_FLAG_PAGE: 3743 all_zero = false; 3744 if (!matches_target_page_size) { 3745 /* For huge pages, we always use temporary buffer */ 3746 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3747 } else { 3748 /* 3749 * For small pages that matches target page size, we 3750 * avoid the qemu_file copy. Instead we directly use 3751 * the buffer of QEMUFile to place the page. Note: we 3752 * cannot do any QEMUFile operation before using that 3753 * buffer to make sure the buffer is valid when 3754 * placing the page. 3755 */ 3756 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3757 TARGET_PAGE_SIZE); 3758 } 3759 break; 3760 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3761 all_zero = false; 3762 len = qemu_get_be32(f); 3763 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3764 error_report("Invalid compressed data length: %d", len); 3765 ret = -EINVAL; 3766 break; 3767 } 3768 decompress_data_with_multi_threads(f, page_buffer, len); 3769 break; 3770 3771 case RAM_SAVE_FLAG_EOS: 3772 /* normal exit */ 3773 multifd_recv_sync_main(); 3774 break; 3775 default: 3776 error_report("Unknown combination of migration flags: 0x%x" 3777 " (postcopy mode)", flags); 3778 ret = -EINVAL; 3779 break; 3780 } 3781 3782 /* Got the whole host page, wait for decompress before placing. */ 3783 if (place_needed) { 3784 ret |= wait_for_decompress_done(); 3785 } 3786 3787 /* Detect for any possible file errors */ 3788 if (!ret && qemu_file_get_error(f)) { 3789 ret = qemu_file_get_error(f); 3790 } 3791 3792 if (!ret && place_needed) { 3793 if (all_zero) { 3794 ret = postcopy_place_page_zero(mis, host_page, block); 3795 } else { 3796 ret = postcopy_place_page(mis, host_page, place_source, 3797 block); 3798 } 3799 place_needed = false; 3800 target_pages = 0; 3801 /* Assume we have a zero page until we detect something different */ 3802 all_zero = true; 3803 } 3804 } 3805 3806 return ret; 3807 } 3808 3809 static bool postcopy_is_advised(void) 3810 { 3811 PostcopyState ps = postcopy_state_get(); 3812 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3813 } 3814 3815 static bool postcopy_is_running(void) 3816 { 3817 PostcopyState ps = postcopy_state_get(); 3818 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3819 } 3820 3821 /* 3822 * Flush content of RAM cache into SVM's memory. 3823 * Only flush the pages that be dirtied by PVM or SVM or both. 
3824 */ 3825 void colo_flush_ram_cache(void) 3826 { 3827 RAMBlock *block = NULL; 3828 void *dst_host; 3829 void *src_host; 3830 unsigned long offset = 0; 3831 3832 memory_global_dirty_log_sync(); 3833 WITH_RCU_READ_LOCK_GUARD() { 3834 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3835 ramblock_sync_dirty_bitmap(ram_state, block); 3836 } 3837 } 3838 3839 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3840 WITH_RCU_READ_LOCK_GUARD() { 3841 block = QLIST_FIRST_RCU(&ram_list.blocks); 3842 3843 while (block) { 3844 unsigned long num = 0; 3845 3846 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3847 if (!offset_in_ramblock(block, 3848 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3849 offset = 0; 3850 num = 0; 3851 block = QLIST_NEXT_RCU(block, next); 3852 } else { 3853 unsigned long i = 0; 3854 3855 for (i = 0; i < num; i++) { 3856 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3857 } 3858 dst_host = block->host 3859 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3860 src_host = block->colo_cache 3861 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3862 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3863 offset += num; 3864 } 3865 } 3866 } 3867 trace_colo_flush_ram_cache_end(); 3868 } 3869 3870 /** 3871 * ram_load_precopy: load pages in precopy case 3872 * 3873 * Returns 0 for success or -errno in case of error 3874 * 3875 * Called in precopy mode by ram_load(). 3876 * rcu_read_lock is taken prior to this being called. 3877 * 3878 * @f: QEMUFile where to send the data 3879 */ 3880 static int ram_load_precopy(QEMUFile *f) 3881 { 3882 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3883 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3884 bool postcopy_advised = postcopy_is_advised(); 3885 if (!migrate_use_compression()) { 3886 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3887 } 3888 3889 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3890 ram_addr_t addr, total_ram_bytes; 3891 void *host = NULL, *host_bak = NULL; 3892 uint8_t ch; 3893 3894 /* 3895 * Yield periodically to let main loop run, but an iteration of 3896 * the main loop is expensive, so do it each some iterations 3897 */ 3898 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3899 aio_co_schedule(qemu_get_current_aio_context(), 3900 qemu_coroutine_self()); 3901 qemu_coroutine_yield(); 3902 } 3903 i++; 3904 3905 addr = qemu_get_be64(f); 3906 flags = addr & ~TARGET_PAGE_MASK; 3907 addr &= TARGET_PAGE_MASK; 3908 3909 if (flags & invalid_flags) { 3910 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3911 error_report("Received an unexpected compressed page"); 3912 } 3913 3914 ret = -EINVAL; 3915 break; 3916 } 3917 3918 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3919 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3920 RAMBlock *block = ram_block_from_stream(f, flags); 3921 3922 host = host_from_ram_block_offset(block, addr); 3923 /* 3924 * After going into COLO stage, we should not load the page 3925 * into SVM's memory directly, we put them into colo_cache firstly. 3926 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 3927 * Previously, we copied all these memory in preparing stage of COLO 3928 * while we need to stop VM, which is a time-consuming process. 3929 * Here we optimize it by a trick, back-up every page while in 3930 * migration process while COLO is enabled, though it affects the 3931 * speed of the migration, but it obviously reduce the downtime of 3932 * back-up all SVM'S memory in COLO preparing stage. 

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage, we should not load the page
             * into the SVM's memory directly; we put it into colo_cache
             * first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage, during which the VM had to be stopped, which is
             * time-consuming.  Here we optimize it by backing up every page
             * during the migration process while COLO is enabled.  Although
             * this slows the migration down a little, it clearly reduces the
             * downtime spent backing up all of the SVM's memory in the COLO
             * preparation stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both the cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated!", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length, &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
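
/*
 * Illustrative note on the rendezvous above: ram_dirty_bitmap_sync_all()
 * sends one recv-bitmap request per RAM block and then waits on
 * rp_state.rp_sem once per block; the replies are consumed on the return
 * path by ram_dirty_bitmap_reload() below, which posts the semaphore via
 * ram_dirty_bitmap_reload_notify() after each successfully reloaded
 * bitmap, so the requester unblocks exactly once per answered block.
 */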

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If
     * this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
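
/*
 * Note (illustrative): the "ram" section is registered above with version 4,
 * which must stay in sync with the version_id check in ram_load(); bumping
 * one without the other would make every incoming stream fail with -EINVAL.
 */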