1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/main-loop.h" 34 #include "xbzrle.h" 35 #include "ram.h" 36 #include "migration.h" 37 #include "migration/register.h" 38 #include "migration/misc.h" 39 #include "qemu-file.h" 40 #include "postcopy-ram.h" 41 #include "page_cache.h" 42 #include "qemu/error-report.h" 43 #include "qapi/error.h" 44 #include "qapi/qapi-types-migration.h" 45 #include "qapi/qapi-events-migration.h" 46 #include "qapi/qmp/qerror.h" 47 #include "trace.h" 48 #include "exec/ram_addr.h" 49 #include "exec/target_page.h" 50 #include "qemu/rcu_queue.h" 51 #include "migration/colo.h" 52 #include "block.h" 53 #include "sysemu/cpu-throttle.h" 54 #include "savevm.h" 55 #include "qemu/iov.h" 56 #include "multifd.h" 57 #include "sysemu/runstate.h" 58 59 #include "hw/boards.h" /* for machine_dump_guest_core() */ 60 61 #if defined(__linux__) 62 #include "qemu/userfaultfd.h" 63 #endif /* defined(__linux__) */ 64 65 /***********************************************************/ 66 /* ram save/restore */ 67 68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 69 * worked for pages that where filled with the same char. We switched 70 * it to only search for the zero value. And to avoid confusion with 71 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it. 72 */ 73 74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 75 #define RAM_SAVE_FLAG_ZERO 0x02 76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 77 #define RAM_SAVE_FLAG_PAGE 0x08 78 #define RAM_SAVE_FLAG_EOS 0x10 79 #define RAM_SAVE_FLAG_CONTINUE 0x20 80 #define RAM_SAVE_FLAG_XBZRLE 0x40 81 /* 0x80 is reserved in migration.h start with 0x100 next */ 82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 83 84 static inline bool is_zero_range(uint8_t *p, uint64_t size) 85 { 86 return buffer_is_zero(p, size); 87 } 88 89 XBZRLECacheStats xbzrle_counters; 90 91 /* struct contains XBZRLE cache and a static page 92 used by the compression */ 93 static struct { 94 /* buffer used for XBZRLE encoding */ 95 uint8_t *encoded_buf; 96 /* buffer for storing page content */ 97 uint8_t *current_buf; 98 /* Cache for XBZRLE, Protected by lock. */ 99 PageCache *cache; 100 QemuMutex lock; 101 /* it will store a page full of zeros */ 102 uint8_t *zero_target_page; 103 /* buffer used for XBZRLE decoding */ 104 uint8_t *decoded_buf; 105 } XBZRLE; 106 107 static void XBZRLE_cache_lock(void) 108 { 109 if (migrate_use_xbzrle()) { 110 qemu_mutex_lock(&XBZRLE.lock); 111 } 112 } 113 114 static void XBZRLE_cache_unlock(void) 115 { 116 if (migrate_use_xbzrle()) { 117 qemu_mutex_unlock(&XBZRLE.lock); 118 } 119 } 120 121 /** 122 * xbzrle_cache_resize: resize the xbzrle cache 123 * 124 * This function is called from migrate_params_apply in main 125 * thread, possibly while a migration is in progress. A running 126 * migration may be using the cache and might finish during this call, 127 * hence changes to the cache are protected by XBZRLE.lock(). 128 * 129 * Returns 0 for success or -1 for error 130 * 131 * @new_size: new cache size 132 * @errp: set *errp if the check failed, with reason 133 */ 134 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 135 { 136 PageCache *new_cache; 137 int64_t ret = 0; 138 139 /* Check for truncation */ 140 if (new_size != (size_t)new_size) { 141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 142 "exceeding address space"); 143 return -1; 144 } 145 146 if (new_size == migrate_xbzrle_cache_size()) { 147 /* nothing to do */ 148 return 0; 149 } 150 151 XBZRLE_cache_lock(); 152 153 if (XBZRLE.cache != NULL) { 154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 155 if (!new_cache) { 156 ret = -1; 157 goto out; 158 } 159 160 cache_fini(XBZRLE.cache); 161 XBZRLE.cache = new_cache; 162 } 163 out: 164 XBZRLE_cache_unlock(); 165 return ret; 166 } 167 168 bool ramblock_is_ignored(RAMBlock *block) 169 { 170 return !qemu_ram_is_migratable(block) || 171 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 172 } 173 174 #undef RAMBLOCK_FOREACH 175 176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 177 { 178 RAMBlock *block; 179 int ret = 0; 180 181 RCU_READ_LOCK_GUARD(); 182 183 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 184 ret = func(block, opaque); 185 if (ret) { 186 break; 187 } 188 } 189 return ret; 190 } 191 192 static void ramblock_recv_map_init(void) 193 { 194 RAMBlock *rb; 195 196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 197 assert(!rb->receivedmap); 198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 199 } 200 } 201 202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 203 { 204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 205 rb->receivedmap); 206 } 207 208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 209 { 210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 211 } 212 213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 214 { 215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 216 } 217 218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 219 size_t nr) 220 { 221 bitmap_set_atomic(rb->receivedmap, 222 ramblock_recv_bitmap_offset(host_addr, rb), 223 nr); 224 } 225 226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 227 228 /* 229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 230 * 231 * Returns >0 if success with sent bytes, or <0 if error. 232 */ 233 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 234 const char *block_name) 235 { 236 RAMBlock *block = qemu_ram_block_by_name(block_name); 237 unsigned long *le_bitmap, nbits; 238 uint64_t size; 239 240 if (!block) { 241 error_report("%s: invalid block name: %s", __func__, block_name); 242 return -1; 243 } 244 245 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 246 247 /* 248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 249 * machines we may need 4 more bytes for padding (see below 250 * comment). So extend it a bit before hand. 251 */ 252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 253 254 /* 255 * Always use little endian when sending the bitmap. This is 256 * required that when source and destination VMs are not using the 257 * same endianness. (Note: big endian won't work.) 258 */ 259 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 260 261 /* Size of the bitmap, in bytes */ 262 size = DIV_ROUND_UP(nbits, 8); 263 264 /* 265 * size is always aligned to 8 bytes for 64bit machines, but it 266 * may not be true for 32bit machines. We need this padding to 267 * make sure the migration can survive even between 32bit and 268 * 64bit machines. 269 */ 270 size = ROUND_UP(size, 8); 271 272 qemu_put_be64(file, size); 273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 274 /* 275 * Mark as an end, in case the middle part is screwed up due to 276 * some "mysterious" reason. 277 */ 278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 279 qemu_fflush(file); 280 281 g_free(le_bitmap); 282 283 if (qemu_file_get_error(file)) { 284 return qemu_file_get_error(file); 285 } 286 287 return size + sizeof(size); 288 } 289 290 /* 291 * An outstanding page request, on the source, having been received 292 * and queued 293 */ 294 struct RAMSrcPageRequest { 295 RAMBlock *rb; 296 hwaddr offset; 297 hwaddr len; 298 299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 300 }; 301 302 /* State of RAM for migration */ 303 struct RAMState { 304 /* QEMUFile used for this migration */ 305 QEMUFile *f; 306 /* UFFD file descriptor, used in 'write-tracking' migration */ 307 int uffdio_fd; 308 /* Last block that we have visited searching for dirty pages */ 309 RAMBlock *last_seen_block; 310 /* Last block from where we have sent data */ 311 RAMBlock *last_sent_block; 312 /* Last dirty target page we have sent */ 313 ram_addr_t last_page; 314 /* last ram version we have seen */ 315 uint32_t last_version; 316 /* How many times we have dirty too many pages */ 317 int dirty_rate_high_cnt; 318 /* these variables are used for bitmap sync */ 319 /* last time we did a full bitmap_sync */ 320 int64_t time_last_bitmap_sync; 321 /* bytes transferred at start_time */ 322 uint64_t bytes_xfer_prev; 323 /* number of dirty pages since start_time */ 324 uint64_t num_dirty_pages_period; 325 /* xbzrle misses since the beginning of the period */ 326 uint64_t xbzrle_cache_miss_prev; 327 /* Amount of xbzrle pages since the beginning of the period */ 328 uint64_t xbzrle_pages_prev; 329 /* Amount of xbzrle encoded bytes since the beginning of the period */ 330 uint64_t xbzrle_bytes_prev; 331 /* Start using XBZRLE (e.g., after the first round). */ 332 bool xbzrle_enabled; 333 334 /* compression statistics since the beginning of the period */ 335 /* amount of count that no free thread to compress data */ 336 uint64_t compress_thread_busy_prev; 337 /* amount bytes after compression */ 338 uint64_t compressed_size_prev; 339 /* amount of compressed pages */ 340 uint64_t compress_pages_prev; 341 342 /* total handled target pages at the beginning of period */ 343 uint64_t target_page_count_prev; 344 /* total handled target pages since start */ 345 uint64_t target_page_count; 346 /* number of dirty bits in the bitmap */ 347 uint64_t migration_dirty_pages; 348 /* Protects modification of the bitmap and migration dirty pages */ 349 QemuMutex bitmap_mutex; 350 /* The RAMBlock used in the last src_page_requests */ 351 RAMBlock *last_req_rb; 352 /* Queue of outstanding page requests from the destination */ 353 QemuMutex src_page_req_mutex; 354 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 355 }; 356 typedef struct RAMState RAMState; 357 358 static RAMState *ram_state; 359 360 static NotifierWithReturnList precopy_notifier_list; 361 362 void precopy_infrastructure_init(void) 363 { 364 notifier_with_return_list_init(&precopy_notifier_list); 365 } 366 367 void precopy_add_notifier(NotifierWithReturn *n) 368 { 369 notifier_with_return_list_add(&precopy_notifier_list, n); 370 } 371 372 void precopy_remove_notifier(NotifierWithReturn *n) 373 { 374 notifier_with_return_remove(n); 375 } 376 377 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 378 { 379 PrecopyNotifyData pnd; 380 pnd.reason = reason; 381 pnd.errp = errp; 382 383 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 384 } 385 386 uint64_t ram_bytes_remaining(void) 387 { 388 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 389 0; 390 } 391 392 MigrationStats ram_counters; 393 394 /* used by the search for pages to send */ 395 struct PageSearchStatus { 396 /* Current block being searched */ 397 RAMBlock *block; 398 /* Current page to search from */ 399 unsigned long page; 400 /* Set once we wrap around */ 401 bool complete_round; 402 }; 403 typedef struct PageSearchStatus PageSearchStatus; 404 405 CompressionStats compression_counters; 406 407 struct CompressParam { 408 bool done; 409 bool quit; 410 bool zero_page; 411 QEMUFile *file; 412 QemuMutex mutex; 413 QemuCond cond; 414 RAMBlock *block; 415 ram_addr_t offset; 416 417 /* internally used fields */ 418 z_stream stream; 419 uint8_t *originbuf; 420 }; 421 typedef struct CompressParam CompressParam; 422 423 struct DecompressParam { 424 bool done; 425 bool quit; 426 QemuMutex mutex; 427 QemuCond cond; 428 void *des; 429 uint8_t *compbuf; 430 int len; 431 z_stream stream; 432 }; 433 typedef struct DecompressParam DecompressParam; 434 435 static CompressParam *comp_param; 436 static QemuThread *compress_threads; 437 /* comp_done_cond is used to wake up the migration thread when 438 * one of the compression threads has finished the compression. 439 * comp_done_lock is used to co-work with comp_done_cond. 440 */ 441 static QemuMutex comp_done_lock; 442 static QemuCond comp_done_cond; 443 /* The empty QEMUFileOps will be used by file in CompressParam */ 444 static const QEMUFileOps empty_ops = { }; 445 446 static QEMUFile *decomp_file; 447 static DecompressParam *decomp_param; 448 static QemuThread *decompress_threads; 449 static QemuMutex decomp_done_lock; 450 static QemuCond decomp_done_cond; 451 452 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 453 ram_addr_t offset, uint8_t *source_buf); 454 455 static void *do_data_compress(void *opaque) 456 { 457 CompressParam *param = opaque; 458 RAMBlock *block; 459 ram_addr_t offset; 460 bool zero_page; 461 462 qemu_mutex_lock(¶m->mutex); 463 while (!param->quit) { 464 if (param->block) { 465 block = param->block; 466 offset = param->offset; 467 param->block = NULL; 468 qemu_mutex_unlock(¶m->mutex); 469 470 zero_page = do_compress_ram_page(param->file, ¶m->stream, 471 block, offset, param->originbuf); 472 473 qemu_mutex_lock(&comp_done_lock); 474 param->done = true; 475 param->zero_page = zero_page; 476 qemu_cond_signal(&comp_done_cond); 477 qemu_mutex_unlock(&comp_done_lock); 478 479 qemu_mutex_lock(¶m->mutex); 480 } else { 481 qemu_cond_wait(¶m->cond, ¶m->mutex); 482 } 483 } 484 qemu_mutex_unlock(¶m->mutex); 485 486 return NULL; 487 } 488 489 static void compress_threads_save_cleanup(void) 490 { 491 int i, thread_count; 492 493 if (!migrate_use_compression() || !comp_param) { 494 return; 495 } 496 497 thread_count = migrate_compress_threads(); 498 for (i = 0; i < thread_count; i++) { 499 /* 500 * we use it as a indicator which shows if the thread is 501 * properly init'd or not 502 */ 503 if (!comp_param[i].file) { 504 break; 505 } 506 507 qemu_mutex_lock(&comp_param[i].mutex); 508 comp_param[i].quit = true; 509 qemu_cond_signal(&comp_param[i].cond); 510 qemu_mutex_unlock(&comp_param[i].mutex); 511 512 qemu_thread_join(compress_threads + i); 513 qemu_mutex_destroy(&comp_param[i].mutex); 514 qemu_cond_destroy(&comp_param[i].cond); 515 deflateEnd(&comp_param[i].stream); 516 g_free(comp_param[i].originbuf); 517 qemu_fclose(comp_param[i].file); 518 comp_param[i].file = NULL; 519 } 520 qemu_mutex_destroy(&comp_done_lock); 521 qemu_cond_destroy(&comp_done_cond); 522 g_free(compress_threads); 523 g_free(comp_param); 524 compress_threads = NULL; 525 comp_param = NULL; 526 } 527 528 static int compress_threads_save_setup(void) 529 { 530 int i, thread_count; 531 532 if (!migrate_use_compression()) { 533 return 0; 534 } 535 thread_count = migrate_compress_threads(); 536 compress_threads = g_new0(QemuThread, thread_count); 537 comp_param = g_new0(CompressParam, thread_count); 538 qemu_cond_init(&comp_done_cond); 539 qemu_mutex_init(&comp_done_lock); 540 for (i = 0; i < thread_count; i++) { 541 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 542 if (!comp_param[i].originbuf) { 543 goto exit; 544 } 545 546 if (deflateInit(&comp_param[i].stream, 547 migrate_compress_level()) != Z_OK) { 548 g_free(comp_param[i].originbuf); 549 goto exit; 550 } 551 552 /* comp_param[i].file is just used as a dummy buffer to save data, 553 * set its ops to empty. 554 */ 555 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false); 556 comp_param[i].done = true; 557 comp_param[i].quit = false; 558 qemu_mutex_init(&comp_param[i].mutex); 559 qemu_cond_init(&comp_param[i].cond); 560 qemu_thread_create(compress_threads + i, "compress", 561 do_data_compress, comp_param + i, 562 QEMU_THREAD_JOINABLE); 563 } 564 return 0; 565 566 exit: 567 compress_threads_save_cleanup(); 568 return -1; 569 } 570 571 /** 572 * save_page_header: write page header to wire 573 * 574 * If this is the 1st block, it also writes the block identification 575 * 576 * Returns the number of bytes written 577 * 578 * @f: QEMUFile where to send the data 579 * @block: block that contains the page we want to send 580 * @offset: offset inside the block for the page 581 * in the lower bits, it contains flags 582 */ 583 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 584 ram_addr_t offset) 585 { 586 size_t size, len; 587 588 if (block == rs->last_sent_block) { 589 offset |= RAM_SAVE_FLAG_CONTINUE; 590 } 591 qemu_put_be64(f, offset); 592 size = 8; 593 594 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 595 len = strlen(block->idstr); 596 qemu_put_byte(f, len); 597 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 598 size += 1 + len; 599 rs->last_sent_block = block; 600 } 601 return size; 602 } 603 604 /** 605 * mig_throttle_guest_down: throttle down the guest 606 * 607 * Reduce amount of guest cpu execution to hopefully slow down memory 608 * writes. If guest dirty memory rate is reduced below the rate at 609 * which we can transfer pages to the destination then we should be 610 * able to complete migration. Some workloads dirty memory way too 611 * fast and will not effectively converge, even with auto-converge. 612 */ 613 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 614 uint64_t bytes_dirty_threshold) 615 { 616 MigrationState *s = migrate_get_current(); 617 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 618 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 619 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 620 int pct_max = s->parameters.max_cpu_throttle; 621 622 uint64_t throttle_now = cpu_throttle_get_percentage(); 623 uint64_t cpu_now, cpu_ideal, throttle_inc; 624 625 /* We have not started throttling yet. Let's start it. */ 626 if (!cpu_throttle_active()) { 627 cpu_throttle_set(pct_initial); 628 } else { 629 /* Throttling already on, just increase the rate */ 630 if (!pct_tailslow) { 631 throttle_inc = pct_increment; 632 } else { 633 /* Compute the ideal CPU percentage used by Guest, which may 634 * make the dirty rate match the dirty rate threshold. */ 635 cpu_now = 100 - throttle_now; 636 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 637 bytes_dirty_period); 638 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 639 } 640 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 641 } 642 } 643 644 void mig_throttle_counter_reset(void) 645 { 646 RAMState *rs = ram_state; 647 648 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 649 rs->num_dirty_pages_period = 0; 650 rs->bytes_xfer_prev = ram_counters.transferred; 651 } 652 653 /** 654 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 655 * 656 * @rs: current RAM state 657 * @current_addr: address for the zero page 658 * 659 * Update the xbzrle cache to reflect a page that's been sent as all 0. 660 * The important thing is that a stale (not-yet-0'd) page be replaced 661 * by the new data. 662 * As a bonus, if the page wasn't in the cache it gets added so that 663 * when a small write is made into the 0'd page it gets XBZRLE sent. 664 */ 665 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 666 { 667 if (!rs->xbzrle_enabled) { 668 return; 669 } 670 671 /* We don't care if this fails to allocate a new cache page 672 * as long as it updated an old one */ 673 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 674 ram_counters.dirty_sync_count); 675 } 676 677 #define ENCODING_FLAG_XBZRLE 0x1 678 679 /** 680 * save_xbzrle_page: compress and send current page 681 * 682 * Returns: 1 means that we wrote the page 683 * 0 means that page is identical to the one already sent 684 * -1 means that xbzrle would be longer than normal 685 * 686 * @rs: current RAM state 687 * @current_data: pointer to the address of the page contents 688 * @current_addr: addr of the page 689 * @block: block that contains the page we want to send 690 * @offset: offset inside the block for the page 691 * @last_stage: if we are at the completion stage 692 */ 693 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 694 ram_addr_t current_addr, RAMBlock *block, 695 ram_addr_t offset, bool last_stage) 696 { 697 int encoded_len = 0, bytes_xbzrle; 698 uint8_t *prev_cached_page; 699 700 if (!cache_is_cached(XBZRLE.cache, current_addr, 701 ram_counters.dirty_sync_count)) { 702 xbzrle_counters.cache_miss++; 703 if (!last_stage) { 704 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 705 ram_counters.dirty_sync_count) == -1) { 706 return -1; 707 } else { 708 /* update *current_data when the page has been 709 inserted into cache */ 710 *current_data = get_cached_data(XBZRLE.cache, current_addr); 711 } 712 } 713 return -1; 714 } 715 716 /* 717 * Reaching here means the page has hit the xbzrle cache, no matter what 718 * encoding result it is (normal encoding, overflow or skipping the page), 719 * count the page as encoded. This is used to calculate the encoding rate. 720 * 721 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 722 * 2nd page turns out to be skipped (i.e. no new bytes written to the 723 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 724 * skipped page included. In this way, the encoding rate can tell if the 725 * guest page is good for xbzrle encoding. 726 */ 727 xbzrle_counters.pages++; 728 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 729 730 /* save current buffer into memory */ 731 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 732 733 /* XBZRLE encoding (if there is no overflow) */ 734 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 735 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 736 TARGET_PAGE_SIZE); 737 738 /* 739 * Update the cache contents, so that it corresponds to the data 740 * sent, in all cases except where we skip the page. 741 */ 742 if (!last_stage && encoded_len != 0) { 743 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 744 /* 745 * In the case where we couldn't compress, ensure that the caller 746 * sends the data from the cache, since the guest might have 747 * changed the RAM since we copied it. 748 */ 749 *current_data = prev_cached_page; 750 } 751 752 if (encoded_len == 0) { 753 trace_save_xbzrle_page_skipping(); 754 return 0; 755 } else if (encoded_len == -1) { 756 trace_save_xbzrle_page_overflow(); 757 xbzrle_counters.overflow++; 758 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 759 return -1; 760 } 761 762 /* Send XBZRLE based compressed page */ 763 bytes_xbzrle = save_page_header(rs, rs->f, block, 764 offset | RAM_SAVE_FLAG_XBZRLE); 765 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 766 qemu_put_be16(rs->f, encoded_len); 767 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 768 bytes_xbzrle += encoded_len + 1 + 2; 769 /* 770 * Like compressed_size (please see update_compress_thread_counts), 771 * the xbzrle encoded bytes don't count the 8 byte header with 772 * RAM_SAVE_FLAG_CONTINUE. 773 */ 774 xbzrle_counters.bytes += bytes_xbzrle - 8; 775 ram_counters.transferred += bytes_xbzrle; 776 777 return 1; 778 } 779 780 /** 781 * migration_bitmap_find_dirty: find the next dirty page from start 782 * 783 * Returns the page offset within memory region of the start of a dirty page 784 * 785 * @rs: current RAM state 786 * @rb: RAMBlock where to search for dirty pages 787 * @start: page where we start the search 788 */ 789 static inline 790 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 791 unsigned long start) 792 { 793 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 794 unsigned long *bitmap = rb->bmap; 795 796 if (ramblock_is_ignored(rb)) { 797 return size; 798 } 799 800 return find_next_bit(bitmap, size, start); 801 } 802 803 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 804 unsigned long page) 805 { 806 uint8_t shift; 807 hwaddr size, start; 808 809 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 810 return; 811 } 812 813 shift = rb->clear_bmap_shift; 814 /* 815 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 816 * can make things easier sometimes since then start address 817 * of the small chunk will always be 64 pages aligned so the 818 * bitmap will always be aligned to unsigned long. We should 819 * even be able to remove this restriction but I'm simply 820 * keeping it. 821 */ 822 assert(shift >= 6); 823 824 size = 1ULL << (TARGET_PAGE_BITS + shift); 825 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 826 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 827 memory_region_clear_dirty_bitmap(rb->mr, start, size); 828 } 829 830 static void 831 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 832 unsigned long start, 833 unsigned long npages) 834 { 835 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 836 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 837 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 838 839 /* 840 * Clear pages from start to start + npages - 1, so the end boundary is 841 * exclusive. 842 */ 843 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 844 migration_clear_memory_region_dirty_bitmap(rb, i); 845 } 846 } 847 848 /* 849 * colo_bitmap_find_diry:find contiguous dirty pages from start 850 * 851 * Returns the page offset within memory region of the start of the contiguout 852 * dirty page 853 * 854 * @rs: current RAM state 855 * @rb: RAMBlock where to search for dirty pages 856 * @start: page where we start the search 857 * @num: the number of contiguous dirty pages 858 */ 859 static inline 860 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 861 unsigned long start, unsigned long *num) 862 { 863 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 864 unsigned long *bitmap = rb->bmap; 865 unsigned long first, next; 866 867 *num = 0; 868 869 if (ramblock_is_ignored(rb)) { 870 return size; 871 } 872 873 first = find_next_bit(bitmap, size, start); 874 if (first >= size) { 875 return first; 876 } 877 next = find_next_zero_bit(bitmap, size, first + 1); 878 assert(next >= first); 879 *num = next - first; 880 return first; 881 } 882 883 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 884 RAMBlock *rb, 885 unsigned long page) 886 { 887 bool ret; 888 889 /* 890 * Clear dirty bitmap if needed. This _must_ be called before we 891 * send any of the page in the chunk because we need to make sure 892 * we can capture further page content changes when we sync dirty 893 * log the next time. So as long as we are going to send any of 894 * the page in the chunk we clear the remote dirty bitmap for all. 895 * Clearing it earlier won't be a problem, but too late will. 896 */ 897 migration_clear_memory_region_dirty_bitmap(rb, page); 898 899 ret = test_and_clear_bit(page, rb->bmap); 900 if (ret) { 901 rs->migration_dirty_pages--; 902 } 903 904 return ret; 905 } 906 907 static void dirty_bitmap_clear_section(MemoryRegionSection *section, 908 void *opaque) 909 { 910 const hwaddr offset = section->offset_within_region; 911 const hwaddr size = int128_get64(section->size); 912 const unsigned long start = offset >> TARGET_PAGE_BITS; 913 const unsigned long npages = size >> TARGET_PAGE_BITS; 914 RAMBlock *rb = section->mr->ram_block; 915 uint64_t *cleared_bits = opaque; 916 917 /* 918 * We don't grab ram_state->bitmap_mutex because we expect to run 919 * only when starting migration or during postcopy recovery where 920 * we don't have concurrent access. 921 */ 922 if (!migration_in_postcopy() && !migrate_background_snapshot()) { 923 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages); 924 } 925 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); 926 bitmap_clear(rb->bmap, start, npages); 927 } 928 929 /* 930 * Exclude all dirty pages from migration that fall into a discarded range as 931 * managed by a RamDiscardManager responsible for the mapped memory region of 932 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps. 933 * 934 * Discarded pages ("logically unplugged") have undefined content and must 935 * not get migrated, because even reading these pages for migration might 936 * result in undesired behavior. 937 * 938 * Returns the number of cleared bits in the RAMBlock dirty bitmap. 939 * 940 * Note: The result is only stable while migrating (precopy/postcopy). 941 */ 942 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 943 { 944 uint64_t cleared_bits = 0; 945 946 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 947 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 948 MemoryRegionSection section = { 949 .mr = rb->mr, 950 .offset_within_region = 0, 951 .size = int128_make64(qemu_ram_get_used_length(rb)), 952 }; 953 954 ram_discard_manager_replay_discarded(rdm, §ion, 955 dirty_bitmap_clear_section, 956 &cleared_bits); 957 } 958 return cleared_bits; 959 } 960 961 /* 962 * Check if a host-page aligned page falls into a discarded range as managed by 963 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 964 * 965 * Note: The result is only stable while migrating (precopy/postcopy). 966 */ 967 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 968 { 969 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 970 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 971 MemoryRegionSection section = { 972 .mr = rb->mr, 973 .offset_within_region = start, 974 .size = int128_make64(qemu_ram_pagesize(rb)), 975 }; 976 977 return !ram_discard_manager_is_populated(rdm, §ion); 978 } 979 return false; 980 } 981 982 /* Called with RCU critical section */ 983 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 984 { 985 uint64_t new_dirty_pages = 986 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 987 988 rs->migration_dirty_pages += new_dirty_pages; 989 rs->num_dirty_pages_period += new_dirty_pages; 990 } 991 992 /** 993 * ram_pagesize_summary: calculate all the pagesizes of a VM 994 * 995 * Returns a summary bitmap of the page sizes of all RAMBlocks 996 * 997 * For VMs with just normal pages this is equivalent to the host page 998 * size. If it's got some huge pages then it's the OR of all the 999 * different page sizes. 1000 */ 1001 uint64_t ram_pagesize_summary(void) 1002 { 1003 RAMBlock *block; 1004 uint64_t summary = 0; 1005 1006 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1007 summary |= block->page_size; 1008 } 1009 1010 return summary; 1011 } 1012 1013 uint64_t ram_get_total_transferred_pages(void) 1014 { 1015 return ram_counters.normal + ram_counters.duplicate + 1016 compression_counters.pages + xbzrle_counters.pages; 1017 } 1018 1019 static void migration_update_rates(RAMState *rs, int64_t end_time) 1020 { 1021 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 1022 double compressed_size; 1023 1024 /* calculate period counters */ 1025 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1026 / (end_time - rs->time_last_bitmap_sync); 1027 1028 if (!page_count) { 1029 return; 1030 } 1031 1032 if (migrate_use_xbzrle()) { 1033 double encoded_size, unencoded_size; 1034 1035 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 1036 rs->xbzrle_cache_miss_prev) / page_count; 1037 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1038 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 1039 TARGET_PAGE_SIZE; 1040 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 1041 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 1042 xbzrle_counters.encoding_rate = 0; 1043 } else { 1044 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 1045 } 1046 rs->xbzrle_pages_prev = xbzrle_counters.pages; 1047 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 1048 } 1049 1050 if (migrate_use_compression()) { 1051 compression_counters.busy_rate = (double)(compression_counters.busy - 1052 rs->compress_thread_busy_prev) / page_count; 1053 rs->compress_thread_busy_prev = compression_counters.busy; 1054 1055 compressed_size = compression_counters.compressed_size - 1056 rs->compressed_size_prev; 1057 if (compressed_size) { 1058 double uncompressed_size = (compression_counters.pages - 1059 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 1060 1061 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 1062 compression_counters.compression_rate = 1063 uncompressed_size / compressed_size; 1064 1065 rs->compress_pages_prev = compression_counters.pages; 1066 rs->compressed_size_prev = compression_counters.compressed_size; 1067 } 1068 } 1069 } 1070 1071 static void migration_trigger_throttle(RAMState *rs) 1072 { 1073 MigrationState *s = migrate_get_current(); 1074 uint64_t threshold = s->parameters.throttle_trigger_threshold; 1075 1076 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 1077 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1078 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1079 1080 /* During block migration the auto-converge logic incorrectly detects 1081 * that ram migration makes no progress. Avoid this by disabling the 1082 * throttling logic during the bulk phase of block migration. */ 1083 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1084 /* The following detection logic can be refined later. For now: 1085 Check to see if the ratio between dirtied bytes and the approx. 1086 amount of bytes that just got transferred since the last time 1087 we were in this routine reaches the threshold. If that happens 1088 twice, start or increase throttling. */ 1089 1090 if ((bytes_dirty_period > bytes_dirty_threshold) && 1091 (++rs->dirty_rate_high_cnt >= 2)) { 1092 trace_migration_throttle(); 1093 rs->dirty_rate_high_cnt = 0; 1094 mig_throttle_guest_down(bytes_dirty_period, 1095 bytes_dirty_threshold); 1096 } 1097 } 1098 } 1099 1100 static void migration_bitmap_sync(RAMState *rs) 1101 { 1102 RAMBlock *block; 1103 int64_t end_time; 1104 1105 ram_counters.dirty_sync_count++; 1106 1107 if (!rs->time_last_bitmap_sync) { 1108 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1109 } 1110 1111 trace_migration_bitmap_sync_start(); 1112 memory_global_dirty_log_sync(); 1113 1114 qemu_mutex_lock(&rs->bitmap_mutex); 1115 WITH_RCU_READ_LOCK_GUARD() { 1116 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1117 ramblock_sync_dirty_bitmap(rs, block); 1118 } 1119 ram_counters.remaining = ram_bytes_remaining(); 1120 } 1121 qemu_mutex_unlock(&rs->bitmap_mutex); 1122 1123 memory_global_after_dirty_log_sync(); 1124 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1125 1126 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1127 1128 /* more than 1 second = 1000 millisecons */ 1129 if (end_time > rs->time_last_bitmap_sync + 1000) { 1130 migration_trigger_throttle(rs); 1131 1132 migration_update_rates(rs, end_time); 1133 1134 rs->target_page_count_prev = rs->target_page_count; 1135 1136 /* reset period counters */ 1137 rs->time_last_bitmap_sync = end_time; 1138 rs->num_dirty_pages_period = 0; 1139 rs->bytes_xfer_prev = ram_counters.transferred; 1140 } 1141 if (migrate_use_events()) { 1142 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1143 } 1144 } 1145 1146 static void migration_bitmap_sync_precopy(RAMState *rs) 1147 { 1148 Error *local_err = NULL; 1149 1150 /* 1151 * The current notifier usage is just an optimization to migration, so we 1152 * don't stop the normal migration process in the error case. 1153 */ 1154 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1155 error_report_err(local_err); 1156 local_err = NULL; 1157 } 1158 1159 migration_bitmap_sync(rs); 1160 1161 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1162 error_report_err(local_err); 1163 } 1164 } 1165 1166 /** 1167 * save_zero_page_to_file: send the zero page to the file 1168 * 1169 * Returns the size of data written to the file, 0 means the page is not 1170 * a zero page 1171 * 1172 * @rs: current RAM state 1173 * @file: the file where the data is saved 1174 * @block: block that contains the page we want to send 1175 * @offset: offset inside the block for the page 1176 */ 1177 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1178 RAMBlock *block, ram_addr_t offset) 1179 { 1180 uint8_t *p = block->host + offset; 1181 int len = 0; 1182 1183 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1184 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1185 qemu_put_byte(file, 0); 1186 len += 1; 1187 } 1188 return len; 1189 } 1190 1191 /** 1192 * save_zero_page: send the zero page to the stream 1193 * 1194 * Returns the number of pages written. 1195 * 1196 * @rs: current RAM state 1197 * @block: block that contains the page we want to send 1198 * @offset: offset inside the block for the page 1199 */ 1200 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1201 { 1202 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1203 1204 if (len) { 1205 ram_counters.duplicate++; 1206 ram_counters.transferred += len; 1207 return 1; 1208 } 1209 return -1; 1210 } 1211 1212 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1213 { 1214 if (!migrate_release_ram() || !migration_in_postcopy()) { 1215 return; 1216 } 1217 1218 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1219 } 1220 1221 /* 1222 * @pages: the number of pages written by the control path, 1223 * < 0 - error 1224 * > 0 - number of pages written 1225 * 1226 * Return true if the pages has been saved, otherwise false is returned. 1227 */ 1228 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1229 int *pages) 1230 { 1231 uint64_t bytes_xmit = 0; 1232 int ret; 1233 1234 *pages = -1; 1235 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1236 &bytes_xmit); 1237 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1238 return false; 1239 } 1240 1241 if (bytes_xmit) { 1242 ram_counters.transferred += bytes_xmit; 1243 *pages = 1; 1244 } 1245 1246 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1247 return true; 1248 } 1249 1250 if (bytes_xmit > 0) { 1251 ram_counters.normal++; 1252 } else if (bytes_xmit == 0) { 1253 ram_counters.duplicate++; 1254 } 1255 1256 return true; 1257 } 1258 1259 /* 1260 * directly send the page to the stream 1261 * 1262 * Returns the number of pages written. 1263 * 1264 * @rs: current RAM state 1265 * @block: block that contains the page we want to send 1266 * @offset: offset inside the block for the page 1267 * @buf: the page to be sent 1268 * @async: send to page asyncly 1269 */ 1270 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1271 uint8_t *buf, bool async) 1272 { 1273 ram_counters.transferred += save_page_header(rs, rs->f, block, 1274 offset | RAM_SAVE_FLAG_PAGE); 1275 if (async) { 1276 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1277 migrate_release_ram() & 1278 migration_in_postcopy()); 1279 } else { 1280 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1281 } 1282 ram_counters.transferred += TARGET_PAGE_SIZE; 1283 ram_counters.normal++; 1284 return 1; 1285 } 1286 1287 /** 1288 * ram_save_page: send the given page to the stream 1289 * 1290 * Returns the number of pages written. 1291 * < 0 - error 1292 * >=0 - Number of pages written - this might legally be 0 1293 * if xbzrle noticed the page was the same. 1294 * 1295 * @rs: current RAM state 1296 * @block: block that contains the page we want to send 1297 * @offset: offset inside the block for the page 1298 * @last_stage: if we are at the completion stage 1299 */ 1300 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1301 { 1302 int pages = -1; 1303 uint8_t *p; 1304 bool send_async = true; 1305 RAMBlock *block = pss->block; 1306 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1307 ram_addr_t current_addr = block->offset + offset; 1308 1309 p = block->host + offset; 1310 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1311 1312 XBZRLE_cache_lock(); 1313 if (rs->xbzrle_enabled && !migration_in_postcopy()) { 1314 pages = save_xbzrle_page(rs, &p, current_addr, block, 1315 offset, last_stage); 1316 if (!last_stage) { 1317 /* Can't send this cached data async, since the cache page 1318 * might get updated before it gets to the wire 1319 */ 1320 send_async = false; 1321 } 1322 } 1323 1324 /* XBZRLE overflow or normal page */ 1325 if (pages == -1) { 1326 pages = save_normal_page(rs, block, offset, p, send_async); 1327 } 1328 1329 XBZRLE_cache_unlock(); 1330 1331 return pages; 1332 } 1333 1334 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1335 ram_addr_t offset) 1336 { 1337 if (multifd_queue_page(rs->f, block, offset) < 0) { 1338 return -1; 1339 } 1340 ram_counters.normal++; 1341 1342 return 1; 1343 } 1344 1345 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1346 ram_addr_t offset, uint8_t *source_buf) 1347 { 1348 RAMState *rs = ram_state; 1349 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1350 bool zero_page = false; 1351 int ret; 1352 1353 if (save_zero_page_to_file(rs, f, block, offset)) { 1354 zero_page = true; 1355 goto exit; 1356 } 1357 1358 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1359 1360 /* 1361 * copy it to a internal buffer to avoid it being modified by VM 1362 * so that we can catch up the error during compression and 1363 * decompression 1364 */ 1365 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1366 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1367 if (ret < 0) { 1368 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1369 error_report("compressed data failed!"); 1370 return false; 1371 } 1372 1373 exit: 1374 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1375 return zero_page; 1376 } 1377 1378 static void 1379 update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1380 { 1381 ram_counters.transferred += bytes_xmit; 1382 1383 if (param->zero_page) { 1384 ram_counters.duplicate++; 1385 return; 1386 } 1387 1388 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1389 compression_counters.compressed_size += bytes_xmit - 8; 1390 compression_counters.pages++; 1391 } 1392 1393 static bool save_page_use_compression(RAMState *rs); 1394 1395 static void flush_compressed_data(RAMState *rs) 1396 { 1397 int idx, len, thread_count; 1398 1399 if (!save_page_use_compression(rs)) { 1400 return; 1401 } 1402 thread_count = migrate_compress_threads(); 1403 1404 qemu_mutex_lock(&comp_done_lock); 1405 for (idx = 0; idx < thread_count; idx++) { 1406 while (!comp_param[idx].done) { 1407 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1408 } 1409 } 1410 qemu_mutex_unlock(&comp_done_lock); 1411 1412 for (idx = 0; idx < thread_count; idx++) { 1413 qemu_mutex_lock(&comp_param[idx].mutex); 1414 if (!comp_param[idx].quit) { 1415 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1416 /* 1417 * it's safe to fetch zero_page without holding comp_done_lock 1418 * as there is no further request submitted to the thread, 1419 * i.e, the thread should be waiting for a request at this point. 1420 */ 1421 update_compress_thread_counts(&comp_param[idx], len); 1422 } 1423 qemu_mutex_unlock(&comp_param[idx].mutex); 1424 } 1425 } 1426 1427 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1428 ram_addr_t offset) 1429 { 1430 param->block = block; 1431 param->offset = offset; 1432 } 1433 1434 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1435 ram_addr_t offset) 1436 { 1437 int idx, thread_count, bytes_xmit = -1, pages = -1; 1438 bool wait = migrate_compress_wait_thread(); 1439 1440 thread_count = migrate_compress_threads(); 1441 qemu_mutex_lock(&comp_done_lock); 1442 retry: 1443 for (idx = 0; idx < thread_count; idx++) { 1444 if (comp_param[idx].done) { 1445 comp_param[idx].done = false; 1446 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1447 qemu_mutex_lock(&comp_param[idx].mutex); 1448 set_compress_params(&comp_param[idx], block, offset); 1449 qemu_cond_signal(&comp_param[idx].cond); 1450 qemu_mutex_unlock(&comp_param[idx].mutex); 1451 pages = 1; 1452 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1453 break; 1454 } 1455 } 1456 1457 /* 1458 * wait for the free thread if the user specifies 'compress-wait-thread', 1459 * otherwise we will post the page out in the main thread as normal page. 1460 */ 1461 if (pages < 0 && wait) { 1462 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1463 goto retry; 1464 } 1465 qemu_mutex_unlock(&comp_done_lock); 1466 1467 return pages; 1468 } 1469 1470 /** 1471 * find_dirty_block: find the next dirty page and update any state 1472 * associated with the search process. 1473 * 1474 * Returns true if a page is found 1475 * 1476 * @rs: current RAM state 1477 * @pss: data about the state of the current dirty page scan 1478 * @again: set to false if the search has scanned the whole of RAM 1479 */ 1480 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1481 { 1482 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1483 if (pss->complete_round && pss->block == rs->last_seen_block && 1484 pss->page >= rs->last_page) { 1485 /* 1486 * We've been once around the RAM and haven't found anything. 1487 * Give up. 1488 */ 1489 *again = false; 1490 return false; 1491 } 1492 if (!offset_in_ramblock(pss->block, 1493 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1494 /* Didn't find anything in this RAM Block */ 1495 pss->page = 0; 1496 pss->block = QLIST_NEXT_RCU(pss->block, next); 1497 if (!pss->block) { 1498 /* 1499 * If memory migration starts over, we will meet a dirtied page 1500 * which may still exists in compression threads's ring, so we 1501 * should flush the compressed data to make sure the new page 1502 * is not overwritten by the old one in the destination. 1503 * 1504 * Also If xbzrle is on, stop using the data compression at this 1505 * point. In theory, xbzrle can do better than compression. 1506 */ 1507 flush_compressed_data(rs); 1508 1509 /* Hit the end of the list */ 1510 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1511 /* Flag that we've looped */ 1512 pss->complete_round = true; 1513 /* After the first round, enable XBZRLE. */ 1514 if (migrate_use_xbzrle()) { 1515 rs->xbzrle_enabled = true; 1516 } 1517 } 1518 /* Didn't find anything this time, but try again on the new block */ 1519 *again = true; 1520 return false; 1521 } else { 1522 /* Can go around again, but... */ 1523 *again = true; 1524 /* We've found something so probably don't need to */ 1525 return true; 1526 } 1527 } 1528 1529 /** 1530 * unqueue_page: gets a page of the queue 1531 * 1532 * Helper for 'get_queued_page' - gets a page off the queue 1533 * 1534 * Returns the block of the page (or NULL if none available) 1535 * 1536 * @rs: current RAM state 1537 * @offset: used to return the offset within the RAMBlock 1538 */ 1539 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1540 { 1541 RAMBlock *block = NULL; 1542 1543 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1544 return NULL; 1545 } 1546 1547 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1548 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1549 struct RAMSrcPageRequest *entry = 1550 QSIMPLEQ_FIRST(&rs->src_page_requests); 1551 block = entry->rb; 1552 *offset = entry->offset; 1553 1554 if (entry->len > TARGET_PAGE_SIZE) { 1555 entry->len -= TARGET_PAGE_SIZE; 1556 entry->offset += TARGET_PAGE_SIZE; 1557 } else { 1558 memory_region_unref(block->mr); 1559 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1560 g_free(entry); 1561 migration_consume_urgent_request(); 1562 } 1563 } 1564 1565 return block; 1566 } 1567 1568 #if defined(__linux__) 1569 /** 1570 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1571 * is found, return RAM block pointer and page offset 1572 * 1573 * Returns pointer to the RAMBlock containing faulting page, 1574 * NULL if no write faults are pending 1575 * 1576 * @rs: current RAM state 1577 * @offset: page offset from the beginning of the block 1578 */ 1579 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1580 { 1581 struct uffd_msg uffd_msg; 1582 void *page_address; 1583 RAMBlock *block; 1584 int res; 1585 1586 if (!migrate_background_snapshot()) { 1587 return NULL; 1588 } 1589 1590 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1591 if (res <= 0) { 1592 return NULL; 1593 } 1594 1595 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1596 block = qemu_ram_block_from_host(page_address, false, offset); 1597 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1598 return block; 1599 } 1600 1601 /** 1602 * ram_save_release_protection: release UFFD write protection after 1603 * a range of pages has been saved 1604 * 1605 * @rs: current RAM state 1606 * @pss: page-search-status structure 1607 * @start_page: index of the first page in the range relative to pss->block 1608 * 1609 * Returns 0 on success, negative value in case of an error 1610 */ 1611 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1612 unsigned long start_page) 1613 { 1614 int res = 0; 1615 1616 /* Check if page is from UFFD-managed region. */ 1617 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1618 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1619 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; 1620 1621 /* Flush async buffers before un-protect. */ 1622 qemu_fflush(rs->f); 1623 /* Un-protect memory range. */ 1624 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1625 false, false); 1626 } 1627 1628 return res; 1629 } 1630 1631 /* ram_write_tracking_available: check if kernel supports required UFFD features 1632 * 1633 * Returns true if supports, false otherwise 1634 */ 1635 bool ram_write_tracking_available(void) 1636 { 1637 uint64_t uffd_features; 1638 int res; 1639 1640 res = uffd_query_features(&uffd_features); 1641 return (res == 0 && 1642 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1643 } 1644 1645 /* ram_write_tracking_compatible: check if guest configuration is 1646 * compatible with 'write-tracking' 1647 * 1648 * Returns true if compatible, false otherwise 1649 */ 1650 bool ram_write_tracking_compatible(void) 1651 { 1652 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1653 int uffd_fd; 1654 RAMBlock *block; 1655 bool ret = false; 1656 1657 /* Open UFFD file descriptor */ 1658 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1659 if (uffd_fd < 0) { 1660 return false; 1661 } 1662 1663 RCU_READ_LOCK_GUARD(); 1664 1665 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1666 uint64_t uffd_ioctls; 1667 1668 /* Nothing to do with read-only and MMIO-writable regions */ 1669 if (block->mr->readonly || block->mr->rom_device) { 1670 continue; 1671 } 1672 /* Try to register block memory via UFFD-IO to track writes */ 1673 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1674 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1675 goto out; 1676 } 1677 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1678 goto out; 1679 } 1680 } 1681 ret = true; 1682 1683 out: 1684 uffd_close_fd(uffd_fd); 1685 return ret; 1686 } 1687 1688 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1689 ram_addr_t size) 1690 { 1691 /* 1692 * We read one byte of each page; this will preallocate page tables if 1693 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1694 * where no page was populated yet. This might require adaption when 1695 * supporting other mappings, like shmem. 1696 */ 1697 for (; offset < size; offset += block->page_size) { 1698 char tmp = *((char *)block->host + offset); 1699 1700 /* Don't optimize the read out */ 1701 asm volatile("" : "+r" (tmp)); 1702 } 1703 } 1704 1705 static inline int populate_read_section(MemoryRegionSection *section, 1706 void *opaque) 1707 { 1708 const hwaddr size = int128_get64(section->size); 1709 hwaddr offset = section->offset_within_region; 1710 RAMBlock *block = section->mr->ram_block; 1711 1712 populate_read_range(block, offset, size); 1713 return 0; 1714 } 1715 1716 /* 1717 * ram_block_populate_read: preallocate page tables and populate pages in the 1718 * RAM block by reading a byte of each page. 1719 * 1720 * Since it's solely used for userfault_fd WP feature, here we just 1721 * hardcode page size to qemu_real_host_page_size. 1722 * 1723 * @block: RAM block to populate 1724 */ 1725 static void ram_block_populate_read(RAMBlock *rb) 1726 { 1727 /* 1728 * Skip populating all pages that fall into a discarded range as managed by 1729 * a RamDiscardManager responsible for the mapped memory region of the 1730 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1731 * must not get populated automatically. We don't have to track 1732 * modifications via userfaultfd WP reliably, because these pages will 1733 * not be part of the migration stream either way -- see 1734 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1735 * 1736 * Note: The result is only stable while migrating (precopy/postcopy). 1737 */ 1738 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1739 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1740 MemoryRegionSection section = { 1741 .mr = rb->mr, 1742 .offset_within_region = 0, 1743 .size = rb->mr->size, 1744 }; 1745 1746 ram_discard_manager_replay_populated(rdm, §ion, 1747 populate_read_section, NULL); 1748 } else { 1749 populate_read_range(rb, 0, rb->used_length); 1750 } 1751 } 1752 1753 /* 1754 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1755 */ 1756 void ram_write_tracking_prepare(void) 1757 { 1758 RAMBlock *block; 1759 1760 RCU_READ_LOCK_GUARD(); 1761 1762 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1763 /* Nothing to do with read-only and MMIO-writable regions */ 1764 if (block->mr->readonly || block->mr->rom_device) { 1765 continue; 1766 } 1767 1768 /* 1769 * Populate pages of the RAM block before enabling userfault_fd 1770 * write protection. 1771 * 1772 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1773 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1774 * pages with pte_none() entries in page table. 1775 */ 1776 ram_block_populate_read(block); 1777 } 1778 } 1779 1780 /* 1781 * ram_write_tracking_start: start UFFD-WP memory tracking 1782 * 1783 * Returns 0 for success or negative value in case of error 1784 */ 1785 int ram_write_tracking_start(void) 1786 { 1787 int uffd_fd; 1788 RAMState *rs = ram_state; 1789 RAMBlock *block; 1790 1791 /* Open UFFD file descriptor */ 1792 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1793 if (uffd_fd < 0) { 1794 return uffd_fd; 1795 } 1796 rs->uffdio_fd = uffd_fd; 1797 1798 RCU_READ_LOCK_GUARD(); 1799 1800 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1801 /* Nothing to do with read-only and MMIO-writable regions */ 1802 if (block->mr->readonly || block->mr->rom_device) { 1803 continue; 1804 } 1805 1806 /* Register block memory with UFFD to track writes */ 1807 if (uffd_register_memory(rs->uffdio_fd, block->host, 1808 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1809 goto fail; 1810 } 1811 /* Apply UFFD write protection to the block memory range */ 1812 if (uffd_change_protection(rs->uffdio_fd, block->host, 1813 block->max_length, true, false)) { 1814 goto fail; 1815 } 1816 block->flags |= RAM_UF_WRITEPROTECT; 1817 memory_region_ref(block->mr); 1818 1819 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1820 block->host, block->max_length); 1821 } 1822 1823 return 0; 1824 1825 fail: 1826 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1827 1828 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1829 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1830 continue; 1831 } 1832 /* 1833 * In case some memory block failed to be write-protected 1834 * remove protection and unregister all succeeded RAM blocks 1835 */ 1836 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1837 false, false); 1838 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1839 /* Cleanup flags and remove reference */ 1840 block->flags &= ~RAM_UF_WRITEPROTECT; 1841 memory_region_unref(block->mr); 1842 } 1843 1844 uffd_close_fd(uffd_fd); 1845 rs->uffdio_fd = -1; 1846 return -1; 1847 } 1848 1849 /** 1850 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1851 */ 1852 void ram_write_tracking_stop(void) 1853 { 1854 RAMState *rs = ram_state; 1855 RAMBlock *block; 1856 1857 RCU_READ_LOCK_GUARD(); 1858 1859 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1860 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1861 continue; 1862 } 1863 /* Remove protection and unregister all affected RAM blocks */ 1864 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length, 1865 false, false); 1866 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1867 1868 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1869 block->host, block->max_length); 1870 1871 /* Cleanup flags and remove reference */ 1872 block->flags &= ~RAM_UF_WRITEPROTECT; 1873 memory_region_unref(block->mr); 1874 } 1875 1876 /* Finally close UFFD file descriptor */ 1877 uffd_close_fd(rs->uffdio_fd); 1878 rs->uffdio_fd = -1; 1879 } 1880 1881 #else 1882 /* No target OS support, stubs just fail or ignore */ 1883 1884 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1885 { 1886 (void) rs; 1887 (void) offset; 1888 1889 return NULL; 1890 } 1891 1892 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1893 unsigned long start_page) 1894 { 1895 (void) rs; 1896 (void) pss; 1897 (void) start_page; 1898 1899 return 0; 1900 } 1901 1902 bool ram_write_tracking_available(void) 1903 { 1904 return false; 1905 } 1906 1907 bool ram_write_tracking_compatible(void) 1908 { 1909 assert(0); 1910 return false; 1911 } 1912 1913 int ram_write_tracking_start(void) 1914 { 1915 assert(0); 1916 return -1; 1917 } 1918 1919 void ram_write_tracking_stop(void) 1920 { 1921 assert(0); 1922 } 1923 #endif /* defined(__linux__) */ 1924 1925 /** 1926 * get_queued_page: unqueue a page from the postcopy requests 1927 * 1928 * Skips pages that are already sent (!dirty) 1929 * 1930 * Returns true if a queued page is found 1931 * 1932 * @rs: current RAM state 1933 * @pss: data about the state of the current dirty page scan 1934 */ 1935 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1936 { 1937 RAMBlock *block; 1938 ram_addr_t offset; 1939 bool dirty; 1940 1941 do { 1942 block = unqueue_page(rs, &offset); 1943 /* 1944 * We're sending this page, and since it's postcopy nothing else 1945 * will dirty it, and we must make sure it doesn't get sent again 1946 * even if this queue request was received after the background 1947 * search already sent it. 1948 */ 1949 if (block) { 1950 unsigned long page; 1951 1952 page = offset >> TARGET_PAGE_BITS; 1953 dirty = test_bit(page, block->bmap); 1954 if (!dirty) { 1955 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1956 page); 1957 } else { 1958 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1959 } 1960 } 1961 1962 } while (block && !dirty); 1963 1964 if (!block) { 1965 /* 1966 * Poll write faults too if background snapshot is enabled; that's 1967 * when we have vcpus got blocked by the write protected pages. 1968 */ 1969 block = poll_fault_page(rs, &offset); 1970 } 1971 1972 if (block) { 1973 /* 1974 * We want the background search to continue from the queued page 1975 * since the guest is likely to want other pages near to the page 1976 * it just requested. 1977 */ 1978 pss->block = block; 1979 pss->page = offset >> TARGET_PAGE_BITS; 1980 1981 /* 1982 * This unqueued page would break the "one round" check, even is 1983 * really rare. 1984 */ 1985 pss->complete_round = false; 1986 } 1987 1988 return !!block; 1989 } 1990 1991 /** 1992 * migration_page_queue_free: drop any remaining pages in the ram 1993 * request queue 1994 * 1995 * It should be empty at the end anyway, but in error cases there may 1996 * be some left. in case that there is any page left, we drop it. 1997 * 1998 */ 1999 static void migration_page_queue_free(RAMState *rs) 2000 { 2001 struct RAMSrcPageRequest *mspr, *next_mspr; 2002 /* This queue generally should be empty - but in the case of a failed 2003 * migration might have some droppings in. 2004 */ 2005 RCU_READ_LOCK_GUARD(); 2006 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2007 memory_region_unref(mspr->rb->mr); 2008 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2009 g_free(mspr); 2010 } 2011 } 2012 2013 /** 2014 * ram_save_queue_pages: queue the page for transmission 2015 * 2016 * A request from postcopy destination for example. 2017 * 2018 * Returns zero on success or negative on error 2019 * 2020 * @rbname: Name of the RAMBLock of the request. NULL means the 2021 * same that last one. 2022 * @start: starting address from the start of the RAMBlock 2023 * @len: length (in bytes) to send 2024 */ 2025 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2026 { 2027 RAMBlock *ramblock; 2028 RAMState *rs = ram_state; 2029 2030 ram_counters.postcopy_requests++; 2031 RCU_READ_LOCK_GUARD(); 2032 2033 if (!rbname) { 2034 /* Reuse last RAMBlock */ 2035 ramblock = rs->last_req_rb; 2036 2037 if (!ramblock) { 2038 /* 2039 * Shouldn't happen, we can't reuse the last RAMBlock if 2040 * it's the 1st request. 2041 */ 2042 error_report("ram_save_queue_pages no previous block"); 2043 return -1; 2044 } 2045 } else { 2046 ramblock = qemu_ram_block_by_name(rbname); 2047 2048 if (!ramblock) { 2049 /* We shouldn't be asked for a non-existent RAMBlock */ 2050 error_report("ram_save_queue_pages no block '%s'", rbname); 2051 return -1; 2052 } 2053 rs->last_req_rb = ramblock; 2054 } 2055 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2056 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2057 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2058 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2059 __func__, start, len, ramblock->used_length); 2060 return -1; 2061 } 2062 2063 struct RAMSrcPageRequest *new_entry = 2064 g_malloc0(sizeof(struct RAMSrcPageRequest)); 2065 new_entry->rb = ramblock; 2066 new_entry->offset = start; 2067 new_entry->len = len; 2068 2069 memory_region_ref(ramblock->mr); 2070 qemu_mutex_lock(&rs->src_page_req_mutex); 2071 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2072 migration_make_urgent_request(); 2073 qemu_mutex_unlock(&rs->src_page_req_mutex); 2074 2075 return 0; 2076 } 2077 2078 static bool save_page_use_compression(RAMState *rs) 2079 { 2080 if (!migrate_use_compression()) { 2081 return false; 2082 } 2083 2084 /* 2085 * If xbzrle is enabled (e.g., after first round of migration), stop 2086 * using the data compression. In theory, xbzrle can do better than 2087 * compression. 2088 */ 2089 if (rs->xbzrle_enabled) { 2090 return false; 2091 } 2092 2093 return true; 2094 } 2095 2096 /* 2097 * try to compress the page before posting it out, return true if the page 2098 * has been properly handled by compression, otherwise needs other 2099 * paths to handle it 2100 */ 2101 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 2102 { 2103 if (!save_page_use_compression(rs)) { 2104 return false; 2105 } 2106 2107 /* 2108 * When starting the process of a new block, the first page of 2109 * the block should be sent out before other pages in the same 2110 * block, and all the pages in last block should have been sent 2111 * out, keeping this order is important, because the 'cont' flag 2112 * is used to avoid resending the block name. 2113 * 2114 * We post the fist page as normal page as compression will take 2115 * much CPU resource. 2116 */ 2117 if (block != rs->last_sent_block) { 2118 flush_compressed_data(rs); 2119 return false; 2120 } 2121 2122 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 2123 return true; 2124 } 2125 2126 compression_counters.busy++; 2127 return false; 2128 } 2129 2130 /** 2131 * ram_save_target_page: save one target page 2132 * 2133 * Returns the number of pages written 2134 * 2135 * @rs: current RAM state 2136 * @pss: data about the page we want to send 2137 * @last_stage: if we are at the completion stage 2138 */ 2139 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 2140 bool last_stage) 2141 { 2142 RAMBlock *block = pss->block; 2143 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2144 int res; 2145 2146 if (control_save_page(rs, block, offset, &res)) { 2147 return res; 2148 } 2149 2150 if (save_compress_page(rs, block, offset)) { 2151 return 1; 2152 } 2153 2154 res = save_zero_page(rs, block, offset); 2155 if (res > 0) { 2156 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2157 * page would be stale 2158 */ 2159 if (!save_page_use_compression(rs)) { 2160 XBZRLE_cache_lock(); 2161 xbzrle_cache_zero_page(rs, block->offset + offset); 2162 XBZRLE_cache_unlock(); 2163 } 2164 ram_release_pages(block->idstr, offset, res); 2165 return res; 2166 } 2167 2168 /* 2169 * Do not use multifd for: 2170 * 1. Compression as the first page in the new block should be posted out 2171 * before sending the compressed page 2172 * 2. In postcopy as one whole host page should be placed 2173 */ 2174 if (!save_page_use_compression(rs) && migrate_use_multifd() 2175 && !migration_in_postcopy()) { 2176 return ram_save_multifd_page(rs, block, offset); 2177 } 2178 2179 return ram_save_page(rs, pss, last_stage); 2180 } 2181 2182 /** 2183 * ram_save_host_page: save a whole host page 2184 * 2185 * Starting at *offset send pages up to the end of the current host 2186 * page. It's valid for the initial offset to point into the middle of 2187 * a host page in which case the remainder of the hostpage is sent. 2188 * Only dirty target pages are sent. Note that the host page size may 2189 * be a huge page for this block. 2190 * The saving stops at the boundary of the used_length of the block 2191 * if the RAMBlock isn't a multiple of the host page size. 2192 * 2193 * Returns the number of pages written or negative on error 2194 * 2195 * @rs: current RAM state 2196 * @ms: current migration state 2197 * @pss: data about the page we want to send 2198 * @last_stage: if we are at the completion stage 2199 */ 2200 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 2201 bool last_stage) 2202 { 2203 int tmppages, pages = 0; 2204 size_t pagesize_bits = 2205 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2206 unsigned long hostpage_boundary = 2207 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); 2208 unsigned long start_page = pss->page; 2209 int res; 2210 2211 if (ramblock_is_ignored(pss->block)) { 2212 error_report("block %s should not be migrated !", pss->block->idstr); 2213 return 0; 2214 } 2215 2216 do { 2217 /* Check the pages is dirty and if it is send it */ 2218 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 2219 tmppages = ram_save_target_page(rs, pss, last_stage); 2220 if (tmppages < 0) { 2221 return tmppages; 2222 } 2223 2224 pages += tmppages; 2225 /* 2226 * Allow rate limiting to happen in the middle of huge pages if 2227 * something is sent in the current iteration. 2228 */ 2229 if (pagesize_bits > 1 && tmppages > 0) { 2230 migration_rate_limit(); 2231 } 2232 } 2233 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 2234 } while ((pss->page < hostpage_boundary) && 2235 offset_in_ramblock(pss->block, 2236 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); 2237 /* The offset we leave with is the min boundary of host page and block */ 2238 pss->page = MIN(pss->page, hostpage_boundary) - 1; 2239 2240 res = ram_save_release_protection(rs, pss, start_page); 2241 return (res < 0 ? res : pages); 2242 } 2243 2244 /** 2245 * ram_find_and_save_block: finds a dirty page and sends it to f 2246 * 2247 * Called within an RCU critical section. 2248 * 2249 * Returns the number of pages written where zero means no dirty pages, 2250 * or negative on error 2251 * 2252 * @rs: current RAM state 2253 * @last_stage: if we are at the completion stage 2254 * 2255 * On systems where host-page-size > target-page-size it will send all the 2256 * pages in a host page that are dirty. 2257 */ 2258 2259 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2260 { 2261 PageSearchStatus pss; 2262 int pages = 0; 2263 bool again, found; 2264 2265 /* No dirty page as there is zero RAM */ 2266 if (!ram_bytes_total()) { 2267 return pages; 2268 } 2269 2270 pss.block = rs->last_seen_block; 2271 pss.page = rs->last_page; 2272 pss.complete_round = false; 2273 2274 if (!pss.block) { 2275 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2276 } 2277 2278 do { 2279 again = true; 2280 found = get_queued_page(rs, &pss); 2281 2282 if (!found) { 2283 /* priority queue empty, so just search for something dirty */ 2284 found = find_dirty_block(rs, &pss, &again); 2285 } 2286 2287 if (found) { 2288 pages = ram_save_host_page(rs, &pss, last_stage); 2289 } 2290 } while (!pages && again); 2291 2292 rs->last_seen_block = pss.block; 2293 rs->last_page = pss.page; 2294 2295 return pages; 2296 } 2297 2298 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2299 { 2300 uint64_t pages = size / TARGET_PAGE_SIZE; 2301 2302 if (zero) { 2303 ram_counters.duplicate += pages; 2304 } else { 2305 ram_counters.normal += pages; 2306 ram_counters.transferred += size; 2307 qemu_update_position(f, size); 2308 } 2309 } 2310 2311 static uint64_t ram_bytes_total_common(bool count_ignored) 2312 { 2313 RAMBlock *block; 2314 uint64_t total = 0; 2315 2316 RCU_READ_LOCK_GUARD(); 2317 2318 if (count_ignored) { 2319 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2320 total += block->used_length; 2321 } 2322 } else { 2323 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2324 total += block->used_length; 2325 } 2326 } 2327 return total; 2328 } 2329 2330 uint64_t ram_bytes_total(void) 2331 { 2332 return ram_bytes_total_common(false); 2333 } 2334 2335 static void xbzrle_load_setup(void) 2336 { 2337 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2338 } 2339 2340 static void xbzrle_load_cleanup(void) 2341 { 2342 g_free(XBZRLE.decoded_buf); 2343 XBZRLE.decoded_buf = NULL; 2344 } 2345 2346 static void ram_state_cleanup(RAMState **rsp) 2347 { 2348 if (*rsp) { 2349 migration_page_queue_free(*rsp); 2350 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2351 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2352 g_free(*rsp); 2353 *rsp = NULL; 2354 } 2355 } 2356 2357 static void xbzrle_cleanup(void) 2358 { 2359 XBZRLE_cache_lock(); 2360 if (XBZRLE.cache) { 2361 cache_fini(XBZRLE.cache); 2362 g_free(XBZRLE.encoded_buf); 2363 g_free(XBZRLE.current_buf); 2364 g_free(XBZRLE.zero_target_page); 2365 XBZRLE.cache = NULL; 2366 XBZRLE.encoded_buf = NULL; 2367 XBZRLE.current_buf = NULL; 2368 XBZRLE.zero_target_page = NULL; 2369 } 2370 XBZRLE_cache_unlock(); 2371 } 2372 2373 static void ram_save_cleanup(void *opaque) 2374 { 2375 RAMState **rsp = opaque; 2376 RAMBlock *block; 2377 2378 /* We don't use dirty log with background snapshots */ 2379 if (!migrate_background_snapshot()) { 2380 /* caller have hold iothread lock or is in a bh, so there is 2381 * no writing race against the migration bitmap 2382 */ 2383 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2384 /* 2385 * do not stop dirty log without starting it, since 2386 * memory_global_dirty_log_stop will assert that 2387 * memory_global_dirty_log_start/stop used in pairs 2388 */ 2389 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2390 } 2391 } 2392 2393 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2394 g_free(block->clear_bmap); 2395 block->clear_bmap = NULL; 2396 g_free(block->bmap); 2397 block->bmap = NULL; 2398 } 2399 2400 xbzrle_cleanup(); 2401 compress_threads_save_cleanup(); 2402 ram_state_cleanup(rsp); 2403 } 2404 2405 static void ram_state_reset(RAMState *rs) 2406 { 2407 rs->last_seen_block = NULL; 2408 rs->last_sent_block = NULL; 2409 rs->last_page = 0; 2410 rs->last_version = ram_list.version; 2411 rs->xbzrle_enabled = false; 2412 } 2413 2414 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2415 2416 /* 2417 * 'expected' is the value you expect the bitmap mostly to be full 2418 * of; it won't bother printing lines that are all this value. 2419 * If 'todump' is null the migration bitmap is dumped. 2420 */ 2421 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2422 unsigned long pages) 2423 { 2424 int64_t cur; 2425 int64_t linelen = 128; 2426 char linebuf[129]; 2427 2428 for (cur = 0; cur < pages; cur += linelen) { 2429 int64_t curb; 2430 bool found = false; 2431 /* 2432 * Last line; catch the case where the line length 2433 * is longer than remaining ram 2434 */ 2435 if (cur + linelen > pages) { 2436 linelen = pages - cur; 2437 } 2438 for (curb = 0; curb < linelen; curb++) { 2439 bool thisbit = test_bit(cur + curb, todump); 2440 linebuf[curb] = thisbit ? '1' : '.'; 2441 found = found || (thisbit != expected); 2442 } 2443 if (found) { 2444 linebuf[curb] = '\0'; 2445 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2446 } 2447 } 2448 } 2449 2450 /* **** functions for postcopy ***** */ 2451 2452 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2453 { 2454 struct RAMBlock *block; 2455 2456 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2457 unsigned long *bitmap = block->bmap; 2458 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2459 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2460 2461 while (run_start < range) { 2462 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2463 ram_discard_range(block->idstr, 2464 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2465 ((ram_addr_t)(run_end - run_start)) 2466 << TARGET_PAGE_BITS); 2467 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2468 } 2469 } 2470 } 2471 2472 /** 2473 * postcopy_send_discard_bm_ram: discard a RAMBlock 2474 * 2475 * Returns zero on success 2476 * 2477 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2478 * 2479 * @ms: current migration state 2480 * @block: RAMBlock to discard 2481 */ 2482 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2483 { 2484 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2485 unsigned long current; 2486 unsigned long *bitmap = block->bmap; 2487 2488 for (current = 0; current < end; ) { 2489 unsigned long one = find_next_bit(bitmap, end, current); 2490 unsigned long zero, discard_length; 2491 2492 if (one >= end) { 2493 break; 2494 } 2495 2496 zero = find_next_zero_bit(bitmap, end, one + 1); 2497 2498 if (zero >= end) { 2499 discard_length = end - one; 2500 } else { 2501 discard_length = zero - one; 2502 } 2503 postcopy_discard_send_range(ms, one, discard_length); 2504 current = one + discard_length; 2505 } 2506 2507 return 0; 2508 } 2509 2510 /** 2511 * postcopy_each_ram_send_discard: discard all RAMBlocks 2512 * 2513 * Returns 0 for success or negative for error 2514 * 2515 * Utility for the outgoing postcopy code. 2516 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2517 * passing it bitmap indexes and name. 2518 * (qemu_ram_foreach_block ends up passing unscaled lengths 2519 * which would mean postcopy code would have to deal with target page) 2520 * 2521 * @ms: current migration state 2522 */ 2523 static int postcopy_each_ram_send_discard(MigrationState *ms) 2524 { 2525 struct RAMBlock *block; 2526 int ret; 2527 2528 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2529 postcopy_discard_send_init(ms, block->idstr); 2530 2531 /* 2532 * Postcopy sends chunks of bitmap over the wire, but it 2533 * just needs indexes at this point, avoids it having 2534 * target page specific code. 2535 */ 2536 ret = postcopy_send_discard_bm_ram(ms, block); 2537 postcopy_discard_send_finish(ms); 2538 if (ret) { 2539 return ret; 2540 } 2541 } 2542 2543 return 0; 2544 } 2545 2546 /** 2547 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2548 * 2549 * Helper for postcopy_chunk_hostpages; it's called twice to 2550 * canonicalize the two bitmaps, that are similar, but one is 2551 * inverted. 2552 * 2553 * Postcopy requires that all target pages in a hostpage are dirty or 2554 * clean, not a mix. This function canonicalizes the bitmaps. 2555 * 2556 * @ms: current migration state 2557 * @block: block that contains the page we want to canonicalize 2558 */ 2559 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2560 { 2561 RAMState *rs = ram_state; 2562 unsigned long *bitmap = block->bmap; 2563 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2564 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2565 unsigned long run_start; 2566 2567 if (block->page_size == TARGET_PAGE_SIZE) { 2568 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2569 return; 2570 } 2571 2572 /* Find a dirty page */ 2573 run_start = find_next_bit(bitmap, pages, 0); 2574 2575 while (run_start < pages) { 2576 2577 /* 2578 * If the start of this run of pages is in the middle of a host 2579 * page, then we need to fixup this host page. 2580 */ 2581 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2582 /* Find the end of this run */ 2583 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2584 /* 2585 * If the end isn't at the start of a host page, then the 2586 * run doesn't finish at the end of a host page 2587 * and we need to discard. 2588 */ 2589 } 2590 2591 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2592 unsigned long page; 2593 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2594 host_ratio); 2595 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2596 2597 /* Clean up the bitmap */ 2598 for (page = fixup_start_addr; 2599 page < fixup_start_addr + host_ratio; page++) { 2600 /* 2601 * Remark them as dirty, updating the count for any pages 2602 * that weren't previously dirty. 2603 */ 2604 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2605 } 2606 } 2607 2608 /* Find the next dirty page for the next iteration */ 2609 run_start = find_next_bit(bitmap, pages, run_start); 2610 } 2611 } 2612 2613 /** 2614 * postcopy_chunk_hostpages: discard any partially sent host page 2615 * 2616 * Utility for the outgoing postcopy code. 2617 * 2618 * Discard any partially sent host-page size chunks, mark any partially 2619 * dirty host-page size chunks as all dirty. In this case the host-page 2620 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 2621 * 2622 * Returns zero on success 2623 * 2624 * @ms: current migration state 2625 * @block: block we want to work with 2626 */ 2627 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2628 { 2629 postcopy_discard_send_init(ms, block->idstr); 2630 2631 /* 2632 * Ensure that all partially dirty host pages are made fully dirty. 2633 */ 2634 postcopy_chunk_hostpages_pass(ms, block); 2635 2636 postcopy_discard_send_finish(ms); 2637 return 0; 2638 } 2639 2640 /** 2641 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2642 * 2643 * Returns zero on success 2644 * 2645 * Transmit the set of pages to be discarded after precopy to the target 2646 * these are pages that: 2647 * a) Have been previously transmitted but are now dirty again 2648 * b) Pages that have never been transmitted, this ensures that 2649 * any pages on the destination that have been mapped by background 2650 * tasks get discarded (transparent huge pages is the specific concern) 2651 * Hopefully this is pretty sparse 2652 * 2653 * @ms: current migration state 2654 */ 2655 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2656 { 2657 RAMState *rs = ram_state; 2658 RAMBlock *block; 2659 int ret; 2660 2661 RCU_READ_LOCK_GUARD(); 2662 2663 /* This should be our last sync, the src is now paused */ 2664 migration_bitmap_sync(rs); 2665 2666 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2667 rs->last_seen_block = NULL; 2668 rs->last_sent_block = NULL; 2669 rs->last_page = 0; 2670 2671 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2672 /* Deal with TPS != HPS and huge pages */ 2673 ret = postcopy_chunk_hostpages(ms, block); 2674 if (ret) { 2675 return ret; 2676 } 2677 2678 #ifdef DEBUG_POSTCOPY 2679 ram_debug_dump_bitmap(block->bmap, true, 2680 block->used_length >> TARGET_PAGE_BITS); 2681 #endif 2682 } 2683 trace_ram_postcopy_send_discard_bitmap(); 2684 2685 return postcopy_each_ram_send_discard(ms); 2686 } 2687 2688 /** 2689 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2690 * 2691 * Returns zero on success 2692 * 2693 * @rbname: name of the RAMBlock of the request. NULL means the 2694 * same that last one. 2695 * @start: RAMBlock starting page 2696 * @length: RAMBlock size 2697 */ 2698 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2699 { 2700 trace_ram_discard_range(rbname, start, length); 2701 2702 RCU_READ_LOCK_GUARD(); 2703 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2704 2705 if (!rb) { 2706 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2707 return -1; 2708 } 2709 2710 /* 2711 * On source VM, we don't need to update the received bitmap since 2712 * we don't even have one. 2713 */ 2714 if (rb->receivedmap) { 2715 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2716 length >> qemu_target_page_bits()); 2717 } 2718 2719 return ram_block_discard_range(rb, start, length); 2720 } 2721 2722 /* 2723 * For every allocation, we will try not to crash the VM if the 2724 * allocation failed. 2725 */ 2726 static int xbzrle_init(void) 2727 { 2728 Error *local_err = NULL; 2729 2730 if (!migrate_use_xbzrle()) { 2731 return 0; 2732 } 2733 2734 XBZRLE_cache_lock(); 2735 2736 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2737 if (!XBZRLE.zero_target_page) { 2738 error_report("%s: Error allocating zero page", __func__); 2739 goto err_out; 2740 } 2741 2742 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2743 TARGET_PAGE_SIZE, &local_err); 2744 if (!XBZRLE.cache) { 2745 error_report_err(local_err); 2746 goto free_zero_page; 2747 } 2748 2749 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2750 if (!XBZRLE.encoded_buf) { 2751 error_report("%s: Error allocating encoded_buf", __func__); 2752 goto free_cache; 2753 } 2754 2755 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2756 if (!XBZRLE.current_buf) { 2757 error_report("%s: Error allocating current_buf", __func__); 2758 goto free_encoded_buf; 2759 } 2760 2761 /* We are all good */ 2762 XBZRLE_cache_unlock(); 2763 return 0; 2764 2765 free_encoded_buf: 2766 g_free(XBZRLE.encoded_buf); 2767 XBZRLE.encoded_buf = NULL; 2768 free_cache: 2769 cache_fini(XBZRLE.cache); 2770 XBZRLE.cache = NULL; 2771 free_zero_page: 2772 g_free(XBZRLE.zero_target_page); 2773 XBZRLE.zero_target_page = NULL; 2774 err_out: 2775 XBZRLE_cache_unlock(); 2776 return -ENOMEM; 2777 } 2778 2779 static int ram_state_init(RAMState **rsp) 2780 { 2781 *rsp = g_try_new0(RAMState, 1); 2782 2783 if (!*rsp) { 2784 error_report("%s: Init ramstate fail", __func__); 2785 return -1; 2786 } 2787 2788 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2789 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2790 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2791 2792 /* 2793 * Count the total number of pages used by ram blocks not including any 2794 * gaps due to alignment or unplugs. 2795 * This must match with the initial values of dirty bitmap. 2796 */ 2797 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2798 ram_state_reset(*rsp); 2799 2800 return 0; 2801 } 2802 2803 static void ram_list_init_bitmaps(void) 2804 { 2805 MigrationState *ms = migrate_get_current(); 2806 RAMBlock *block; 2807 unsigned long pages; 2808 uint8_t shift; 2809 2810 /* Skip setting bitmap if there is no RAM */ 2811 if (ram_bytes_total()) { 2812 shift = ms->clear_bitmap_shift; 2813 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2814 error_report("clear_bitmap_shift (%u) too big, using " 2815 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2816 shift = CLEAR_BITMAP_SHIFT_MAX; 2817 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2818 error_report("clear_bitmap_shift (%u) too small, using " 2819 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2820 shift = CLEAR_BITMAP_SHIFT_MIN; 2821 } 2822 2823 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2824 pages = block->max_length >> TARGET_PAGE_BITS; 2825 /* 2826 * The initial dirty bitmap for migration must be set with all 2827 * ones to make sure we'll migrate every guest RAM page to 2828 * destination. 2829 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2830 * new migration after a failed migration, ram_list. 2831 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2832 * guest memory. 2833 */ 2834 block->bmap = bitmap_new(pages); 2835 bitmap_set(block->bmap, 0, pages); 2836 block->clear_bmap_shift = shift; 2837 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2838 } 2839 } 2840 } 2841 2842 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2843 { 2844 unsigned long pages; 2845 RAMBlock *rb; 2846 2847 RCU_READ_LOCK_GUARD(); 2848 2849 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2850 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2851 rs->migration_dirty_pages -= pages; 2852 } 2853 } 2854 2855 static void ram_init_bitmaps(RAMState *rs) 2856 { 2857 /* For memory_global_dirty_log_start below. */ 2858 qemu_mutex_lock_iothread(); 2859 qemu_mutex_lock_ramlist(); 2860 2861 WITH_RCU_READ_LOCK_GUARD() { 2862 ram_list_init_bitmaps(); 2863 /* We don't use dirty log with background snapshots */ 2864 if (!migrate_background_snapshot()) { 2865 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 2866 migration_bitmap_sync_precopy(rs); 2867 } 2868 } 2869 qemu_mutex_unlock_ramlist(); 2870 qemu_mutex_unlock_iothread(); 2871 2872 /* 2873 * After an eventual first bitmap sync, fixup the initial bitmap 2874 * containing all 1s to exclude any discarded pages from migration. 2875 */ 2876 migration_bitmap_clear_discarded_pages(rs); 2877 } 2878 2879 static int ram_init_all(RAMState **rsp) 2880 { 2881 if (ram_state_init(rsp)) { 2882 return -1; 2883 } 2884 2885 if (xbzrle_init()) { 2886 ram_state_cleanup(rsp); 2887 return -1; 2888 } 2889 2890 ram_init_bitmaps(*rsp); 2891 2892 return 0; 2893 } 2894 2895 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2896 { 2897 RAMBlock *block; 2898 uint64_t pages = 0; 2899 2900 /* 2901 * Postcopy is not using xbzrle/compression, so no need for that. 2902 * Also, since source are already halted, we don't need to care 2903 * about dirty page logging as well. 2904 */ 2905 2906 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2907 pages += bitmap_count_one(block->bmap, 2908 block->used_length >> TARGET_PAGE_BITS); 2909 } 2910 2911 /* This may not be aligned with current bitmaps. Recalculate. */ 2912 rs->migration_dirty_pages = pages; 2913 2914 ram_state_reset(rs); 2915 2916 /* Update RAMState cache of output QEMUFile */ 2917 rs->f = out; 2918 2919 trace_ram_state_resume_prepare(pages); 2920 } 2921 2922 /* 2923 * This function clears bits of the free pages reported by the caller from the 2924 * migration dirty bitmap. @addr is the host address corresponding to the 2925 * start of the continuous guest free pages, and @len is the total bytes of 2926 * those pages. 2927 */ 2928 void qemu_guest_free_page_hint(void *addr, size_t len) 2929 { 2930 RAMBlock *block; 2931 ram_addr_t offset; 2932 size_t used_len, start, npages; 2933 MigrationState *s = migrate_get_current(); 2934 2935 /* This function is currently expected to be used during live migration */ 2936 if (!migration_is_setup_or_active(s->state)) { 2937 return; 2938 } 2939 2940 for (; len > 0; len -= used_len, addr += used_len) { 2941 block = qemu_ram_block_from_host(addr, false, &offset); 2942 if (unlikely(!block || offset >= block->used_length)) { 2943 /* 2944 * The implementation might not support RAMBlock resize during 2945 * live migration, but it could happen in theory with future 2946 * updates. So we add a check here to capture that case. 2947 */ 2948 error_report_once("%s unexpected error", __func__); 2949 return; 2950 } 2951 2952 if (len <= block->used_length - offset) { 2953 used_len = len; 2954 } else { 2955 used_len = block->used_length - offset; 2956 } 2957 2958 start = offset >> TARGET_PAGE_BITS; 2959 npages = used_len >> TARGET_PAGE_BITS; 2960 2961 qemu_mutex_lock(&ram_state->bitmap_mutex); 2962 /* 2963 * The skipped free pages are equavalent to be sent from clear_bmap's 2964 * perspective, so clear the bits from the memory region bitmap which 2965 * are initially set. Otherwise those skipped pages will be sent in 2966 * the next round after syncing from the memory region bitmap. 2967 */ 2968 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2969 ram_state->migration_dirty_pages -= 2970 bitmap_count_one_with_offset(block->bmap, start, npages); 2971 bitmap_clear(block->bmap, start, npages); 2972 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2973 } 2974 } 2975 2976 /* 2977 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2978 * long-running RCU critical section. When rcu-reclaims in the code 2979 * start to become numerous it will be necessary to reduce the 2980 * granularity of these critical sections. 2981 */ 2982 2983 /** 2984 * ram_save_setup: Setup RAM for migration 2985 * 2986 * Returns zero to indicate success and negative for error 2987 * 2988 * @f: QEMUFile where to send the data 2989 * @opaque: RAMState pointer 2990 */ 2991 static int ram_save_setup(QEMUFile *f, void *opaque) 2992 { 2993 RAMState **rsp = opaque; 2994 RAMBlock *block; 2995 2996 if (compress_threads_save_setup()) { 2997 return -1; 2998 } 2999 3000 /* migration has already setup the bitmap, reuse it. */ 3001 if (!migration_in_colo_state()) { 3002 if (ram_init_all(rsp) != 0) { 3003 compress_threads_save_cleanup(); 3004 return -1; 3005 } 3006 } 3007 (*rsp)->f = f; 3008 3009 WITH_RCU_READ_LOCK_GUARD() { 3010 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3011 3012 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3013 qemu_put_byte(f, strlen(block->idstr)); 3014 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3015 qemu_put_be64(f, block->used_length); 3016 if (migrate_postcopy_ram() && block->page_size != 3017 qemu_host_page_size) { 3018 qemu_put_be64(f, block->page_size); 3019 } 3020 if (migrate_ignore_shared()) { 3021 qemu_put_be64(f, block->mr->addr); 3022 } 3023 } 3024 } 3025 3026 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3027 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3028 3029 multifd_send_sync_main(f); 3030 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3031 qemu_fflush(f); 3032 3033 return 0; 3034 } 3035 3036 /** 3037 * ram_save_iterate: iterative stage for migration 3038 * 3039 * Returns zero to indicate success and negative for error 3040 * 3041 * @f: QEMUFile where to send the data 3042 * @opaque: RAMState pointer 3043 */ 3044 static int ram_save_iterate(QEMUFile *f, void *opaque) 3045 { 3046 RAMState **temp = opaque; 3047 RAMState *rs = *temp; 3048 int ret = 0; 3049 int i; 3050 int64_t t0; 3051 int done = 0; 3052 3053 if (blk_mig_bulk_active()) { 3054 /* Avoid transferring ram during bulk phase of block migration as 3055 * the bulk phase will usually take a long time and transferring 3056 * ram updates during that time is pointless. */ 3057 goto out; 3058 } 3059 3060 /* 3061 * We'll take this lock a little bit long, but it's okay for two reasons. 3062 * Firstly, the only possible other thread to take it is who calls 3063 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3064 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3065 * guarantees that we'll at least released it in a regular basis. 3066 */ 3067 qemu_mutex_lock(&rs->bitmap_mutex); 3068 WITH_RCU_READ_LOCK_GUARD() { 3069 if (ram_list.version != rs->last_version) { 3070 ram_state_reset(rs); 3071 } 3072 3073 /* Read version before ram_list.blocks */ 3074 smp_rmb(); 3075 3076 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3077 3078 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3079 i = 0; 3080 while ((ret = qemu_file_rate_limit(f)) == 0 || 3081 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 3082 int pages; 3083 3084 if (qemu_file_get_error(f)) { 3085 break; 3086 } 3087 3088 pages = ram_find_and_save_block(rs, false); 3089 /* no more pages to sent */ 3090 if (pages == 0) { 3091 done = 1; 3092 break; 3093 } 3094 3095 if (pages < 0) { 3096 qemu_file_set_error(f, pages); 3097 break; 3098 } 3099 3100 rs->target_page_count += pages; 3101 3102 /* 3103 * During postcopy, it is necessary to make sure one whole host 3104 * page is sent in one chunk. 3105 */ 3106 if (migrate_postcopy_ram()) { 3107 flush_compressed_data(rs); 3108 } 3109 3110 /* 3111 * we want to check in the 1st loop, just in case it was the 1st 3112 * time and we had to sync the dirty bitmap. 3113 * qemu_clock_get_ns() is a bit expensive, so we only check each 3114 * some iterations 3115 */ 3116 if ((i & 63) == 0) { 3117 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3118 1000000; 3119 if (t1 > MAX_WAIT) { 3120 trace_ram_save_iterate_big_wait(t1, i); 3121 break; 3122 } 3123 } 3124 i++; 3125 } 3126 } 3127 qemu_mutex_unlock(&rs->bitmap_mutex); 3128 3129 /* 3130 * Must occur before EOS (or any QEMUFile operation) 3131 * because of RDMA protocol. 3132 */ 3133 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3134 3135 out: 3136 if (ret >= 0 3137 && migration_is_setup_or_active(migrate_get_current()->state)) { 3138 multifd_send_sync_main(rs->f); 3139 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3140 qemu_fflush(f); 3141 ram_counters.transferred += 8; 3142 3143 ret = qemu_file_get_error(f); 3144 } 3145 if (ret < 0) { 3146 return ret; 3147 } 3148 3149 return done; 3150 } 3151 3152 /** 3153 * ram_save_complete: function called to send the remaining amount of ram 3154 * 3155 * Returns zero to indicate success or negative on error 3156 * 3157 * Called with iothread lock 3158 * 3159 * @f: QEMUFile where to send the data 3160 * @opaque: RAMState pointer 3161 */ 3162 static int ram_save_complete(QEMUFile *f, void *opaque) 3163 { 3164 RAMState **temp = opaque; 3165 RAMState *rs = *temp; 3166 int ret = 0; 3167 3168 WITH_RCU_READ_LOCK_GUARD() { 3169 if (!migration_in_postcopy()) { 3170 migration_bitmap_sync_precopy(rs); 3171 } 3172 3173 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3174 3175 /* try transferring iterative blocks of memory */ 3176 3177 /* flush all remaining blocks regardless of rate limiting */ 3178 while (true) { 3179 int pages; 3180 3181 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 3182 /* no more blocks to sent */ 3183 if (pages == 0) { 3184 break; 3185 } 3186 if (pages < 0) { 3187 ret = pages; 3188 break; 3189 } 3190 } 3191 3192 flush_compressed_data(rs); 3193 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3194 } 3195 3196 if (ret >= 0) { 3197 multifd_send_sync_main(rs->f); 3198 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3199 qemu_fflush(f); 3200 } 3201 3202 return ret; 3203 } 3204 3205 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 3206 uint64_t *res_precopy_only, 3207 uint64_t *res_compatible, 3208 uint64_t *res_postcopy_only) 3209 { 3210 RAMState **temp = opaque; 3211 RAMState *rs = *temp; 3212 uint64_t remaining_size; 3213 3214 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3215 3216 if (!migration_in_postcopy() && 3217 remaining_size < max_size) { 3218 qemu_mutex_lock_iothread(); 3219 WITH_RCU_READ_LOCK_GUARD() { 3220 migration_bitmap_sync_precopy(rs); 3221 } 3222 qemu_mutex_unlock_iothread(); 3223 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3224 } 3225 3226 if (migrate_postcopy_ram()) { 3227 /* We can do postcopy, and all the data is postcopiable */ 3228 *res_compatible += remaining_size; 3229 } else { 3230 *res_precopy_only += remaining_size; 3231 } 3232 } 3233 3234 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3235 { 3236 unsigned int xh_len; 3237 int xh_flags; 3238 uint8_t *loaded_data; 3239 3240 /* extract RLE header */ 3241 xh_flags = qemu_get_byte(f); 3242 xh_len = qemu_get_be16(f); 3243 3244 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3245 error_report("Failed to load XBZRLE page - wrong compression!"); 3246 return -1; 3247 } 3248 3249 if (xh_len > TARGET_PAGE_SIZE) { 3250 error_report("Failed to load XBZRLE page - len overflow!"); 3251 return -1; 3252 } 3253 loaded_data = XBZRLE.decoded_buf; 3254 /* load data and decode */ 3255 /* it can change loaded_data to point to an internal buffer */ 3256 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3257 3258 /* decode RLE */ 3259 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3260 TARGET_PAGE_SIZE) == -1) { 3261 error_report("Failed to load XBZRLE page - decode error!"); 3262 return -1; 3263 } 3264 3265 return 0; 3266 } 3267 3268 /** 3269 * ram_block_from_stream: read a RAMBlock id from the migration stream 3270 * 3271 * Must be called from within a rcu critical section. 3272 * 3273 * Returns a pointer from within the RCU-protected ram_list. 3274 * 3275 * @f: QEMUFile where to read the data from 3276 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3277 */ 3278 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3279 { 3280 static RAMBlock *block; 3281 char id[256]; 3282 uint8_t len; 3283 3284 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3285 if (!block) { 3286 error_report("Ack, bad migration stream!"); 3287 return NULL; 3288 } 3289 return block; 3290 } 3291 3292 len = qemu_get_byte(f); 3293 qemu_get_buffer(f, (uint8_t *)id, len); 3294 id[len] = 0; 3295 3296 block = qemu_ram_block_by_name(id); 3297 if (!block) { 3298 error_report("Can't find block %s", id); 3299 return NULL; 3300 } 3301 3302 if (ramblock_is_ignored(block)) { 3303 error_report("block %s should not be migrated !", id); 3304 return NULL; 3305 } 3306 3307 return block; 3308 } 3309 3310 static inline void *host_from_ram_block_offset(RAMBlock *block, 3311 ram_addr_t offset) 3312 { 3313 if (!offset_in_ramblock(block, offset)) { 3314 return NULL; 3315 } 3316 3317 return block->host + offset; 3318 } 3319 3320 static void *host_page_from_ram_block_offset(RAMBlock *block, 3321 ram_addr_t offset) 3322 { 3323 /* Note: Explicitly no check against offset_in_ramblock(). */ 3324 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3325 block->page_size); 3326 } 3327 3328 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3329 ram_addr_t offset) 3330 { 3331 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3332 } 3333 3334 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3335 ram_addr_t offset, bool record_bitmap) 3336 { 3337 if (!offset_in_ramblock(block, offset)) { 3338 return NULL; 3339 } 3340 if (!block->colo_cache) { 3341 error_report("%s: colo_cache is NULL in block :%s", 3342 __func__, block->idstr); 3343 return NULL; 3344 } 3345 3346 /* 3347 * During colo checkpoint, we need bitmap of these migrated pages. 3348 * It help us to decide which pages in ram cache should be flushed 3349 * into VM's RAM later. 3350 */ 3351 if (record_bitmap && 3352 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3353 ram_state->migration_dirty_pages++; 3354 } 3355 return block->colo_cache + offset; 3356 } 3357 3358 /** 3359 * ram_handle_compressed: handle the zero page case 3360 * 3361 * If a page (or a whole RDMA chunk) has been 3362 * determined to be zero, then zap it. 3363 * 3364 * @host: host address for the zero page 3365 * @ch: what the page is filled from. We only support zero 3366 * @size: size of the zero page 3367 */ 3368 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3369 { 3370 if (ch != 0 || !is_zero_range(host, size)) { 3371 memset(host, ch, size); 3372 } 3373 } 3374 3375 /* return the size after decompression, or negative value on error */ 3376 static int 3377 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3378 const uint8_t *source, size_t source_len) 3379 { 3380 int err; 3381 3382 err = inflateReset(stream); 3383 if (err != Z_OK) { 3384 return -1; 3385 } 3386 3387 stream->avail_in = source_len; 3388 stream->next_in = (uint8_t *)source; 3389 stream->avail_out = dest_len; 3390 stream->next_out = dest; 3391 3392 err = inflate(stream, Z_NO_FLUSH); 3393 if (err != Z_STREAM_END) { 3394 return -1; 3395 } 3396 3397 return stream->total_out; 3398 } 3399 3400 static void *do_data_decompress(void *opaque) 3401 { 3402 DecompressParam *param = opaque; 3403 unsigned long pagesize; 3404 uint8_t *des; 3405 int len, ret; 3406 3407 qemu_mutex_lock(¶m->mutex); 3408 while (!param->quit) { 3409 if (param->des) { 3410 des = param->des; 3411 len = param->len; 3412 param->des = 0; 3413 qemu_mutex_unlock(¶m->mutex); 3414 3415 pagesize = TARGET_PAGE_SIZE; 3416 3417 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3418 param->compbuf, len); 3419 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3420 error_report("decompress data failed"); 3421 qemu_file_set_error(decomp_file, ret); 3422 } 3423 3424 qemu_mutex_lock(&decomp_done_lock); 3425 param->done = true; 3426 qemu_cond_signal(&decomp_done_cond); 3427 qemu_mutex_unlock(&decomp_done_lock); 3428 3429 qemu_mutex_lock(¶m->mutex); 3430 } else { 3431 qemu_cond_wait(¶m->cond, ¶m->mutex); 3432 } 3433 } 3434 qemu_mutex_unlock(¶m->mutex); 3435 3436 return NULL; 3437 } 3438 3439 static int wait_for_decompress_done(void) 3440 { 3441 int idx, thread_count; 3442 3443 if (!migrate_use_compression()) { 3444 return 0; 3445 } 3446 3447 thread_count = migrate_decompress_threads(); 3448 qemu_mutex_lock(&decomp_done_lock); 3449 for (idx = 0; idx < thread_count; idx++) { 3450 while (!decomp_param[idx].done) { 3451 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3452 } 3453 } 3454 qemu_mutex_unlock(&decomp_done_lock); 3455 return qemu_file_get_error(decomp_file); 3456 } 3457 3458 static void compress_threads_load_cleanup(void) 3459 { 3460 int i, thread_count; 3461 3462 if (!migrate_use_compression()) { 3463 return; 3464 } 3465 thread_count = migrate_decompress_threads(); 3466 for (i = 0; i < thread_count; i++) { 3467 /* 3468 * we use it as a indicator which shows if the thread is 3469 * properly init'd or not 3470 */ 3471 if (!decomp_param[i].compbuf) { 3472 break; 3473 } 3474 3475 qemu_mutex_lock(&decomp_param[i].mutex); 3476 decomp_param[i].quit = true; 3477 qemu_cond_signal(&decomp_param[i].cond); 3478 qemu_mutex_unlock(&decomp_param[i].mutex); 3479 } 3480 for (i = 0; i < thread_count; i++) { 3481 if (!decomp_param[i].compbuf) { 3482 break; 3483 } 3484 3485 qemu_thread_join(decompress_threads + i); 3486 qemu_mutex_destroy(&decomp_param[i].mutex); 3487 qemu_cond_destroy(&decomp_param[i].cond); 3488 inflateEnd(&decomp_param[i].stream); 3489 g_free(decomp_param[i].compbuf); 3490 decomp_param[i].compbuf = NULL; 3491 } 3492 g_free(decompress_threads); 3493 g_free(decomp_param); 3494 decompress_threads = NULL; 3495 decomp_param = NULL; 3496 decomp_file = NULL; 3497 } 3498 3499 static int compress_threads_load_setup(QEMUFile *f) 3500 { 3501 int i, thread_count; 3502 3503 if (!migrate_use_compression()) { 3504 return 0; 3505 } 3506 3507 thread_count = migrate_decompress_threads(); 3508 decompress_threads = g_new0(QemuThread, thread_count); 3509 decomp_param = g_new0(DecompressParam, thread_count); 3510 qemu_mutex_init(&decomp_done_lock); 3511 qemu_cond_init(&decomp_done_cond); 3512 decomp_file = f; 3513 for (i = 0; i < thread_count; i++) { 3514 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3515 goto exit; 3516 } 3517 3518 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3519 qemu_mutex_init(&decomp_param[i].mutex); 3520 qemu_cond_init(&decomp_param[i].cond); 3521 decomp_param[i].done = true; 3522 decomp_param[i].quit = false; 3523 qemu_thread_create(decompress_threads + i, "decompress", 3524 do_data_decompress, decomp_param + i, 3525 QEMU_THREAD_JOINABLE); 3526 } 3527 return 0; 3528 exit: 3529 compress_threads_load_cleanup(); 3530 return -1; 3531 } 3532 3533 static void decompress_data_with_multi_threads(QEMUFile *f, 3534 void *host, int len) 3535 { 3536 int idx, thread_count; 3537 3538 thread_count = migrate_decompress_threads(); 3539 QEMU_LOCK_GUARD(&decomp_done_lock); 3540 while (true) { 3541 for (idx = 0; idx < thread_count; idx++) { 3542 if (decomp_param[idx].done) { 3543 decomp_param[idx].done = false; 3544 qemu_mutex_lock(&decomp_param[idx].mutex); 3545 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3546 decomp_param[idx].des = host; 3547 decomp_param[idx].len = len; 3548 qemu_cond_signal(&decomp_param[idx].cond); 3549 qemu_mutex_unlock(&decomp_param[idx].mutex); 3550 break; 3551 } 3552 } 3553 if (idx < thread_count) { 3554 break; 3555 } else { 3556 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3557 } 3558 } 3559 } 3560 3561 static void colo_init_ram_state(void) 3562 { 3563 ram_state_init(&ram_state); 3564 } 3565 3566 /* 3567 * colo cache: this is for secondary VM, we cache the whole 3568 * memory of the secondary VM, it is need to hold the global lock 3569 * to call this helper. 3570 */ 3571 int colo_init_ram_cache(void) 3572 { 3573 RAMBlock *block; 3574 3575 WITH_RCU_READ_LOCK_GUARD() { 3576 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3577 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3578 NULL, false, false); 3579 if (!block->colo_cache) { 3580 error_report("%s: Can't alloc memory for COLO cache of block %s," 3581 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3582 block->used_length); 3583 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3584 if (block->colo_cache) { 3585 qemu_anon_ram_free(block->colo_cache, block->used_length); 3586 block->colo_cache = NULL; 3587 } 3588 } 3589 return -errno; 3590 } 3591 if (!machine_dump_guest_core(current_machine)) { 3592 qemu_madvise(block->colo_cache, block->used_length, 3593 QEMU_MADV_DONTDUMP); 3594 } 3595 } 3596 } 3597 3598 /* 3599 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3600 * with to decide which page in cache should be flushed into SVM's RAM. Here 3601 * we use the same name 'ram_bitmap' as for migration. 3602 */ 3603 if (ram_bytes_total()) { 3604 RAMBlock *block; 3605 3606 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3607 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3608 block->bmap = bitmap_new(pages); 3609 } 3610 } 3611 3612 colo_init_ram_state(); 3613 return 0; 3614 } 3615 3616 /* TODO: duplicated with ram_init_bitmaps */ 3617 void colo_incoming_start_dirty_log(void) 3618 { 3619 RAMBlock *block = NULL; 3620 /* For memory_global_dirty_log_start below. */ 3621 qemu_mutex_lock_iothread(); 3622 qemu_mutex_lock_ramlist(); 3623 3624 memory_global_dirty_log_sync(); 3625 WITH_RCU_READ_LOCK_GUARD() { 3626 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3627 ramblock_sync_dirty_bitmap(ram_state, block); 3628 /* Discard this dirty bitmap record */ 3629 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3630 } 3631 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3632 } 3633 ram_state->migration_dirty_pages = 0; 3634 qemu_mutex_unlock_ramlist(); 3635 qemu_mutex_unlock_iothread(); 3636 } 3637 3638 /* It is need to hold the global lock to call this helper */ 3639 void colo_release_ram_cache(void) 3640 { 3641 RAMBlock *block; 3642 3643 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3644 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3645 g_free(block->bmap); 3646 block->bmap = NULL; 3647 } 3648 3649 WITH_RCU_READ_LOCK_GUARD() { 3650 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3651 if (block->colo_cache) { 3652 qemu_anon_ram_free(block->colo_cache, block->used_length); 3653 block->colo_cache = NULL; 3654 } 3655 } 3656 } 3657 ram_state_cleanup(&ram_state); 3658 } 3659 3660 /** 3661 * ram_load_setup: Setup RAM for migration incoming side 3662 * 3663 * Returns zero to indicate success and negative for error 3664 * 3665 * @f: QEMUFile where to receive the data 3666 * @opaque: RAMState pointer 3667 */ 3668 static int ram_load_setup(QEMUFile *f, void *opaque) 3669 { 3670 if (compress_threads_load_setup(f)) { 3671 return -1; 3672 } 3673 3674 xbzrle_load_setup(); 3675 ramblock_recv_map_init(); 3676 3677 return 0; 3678 } 3679 3680 static int ram_load_cleanup(void *opaque) 3681 { 3682 RAMBlock *rb; 3683 3684 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3685 qemu_ram_block_writeback(rb); 3686 } 3687 3688 xbzrle_load_cleanup(); 3689 compress_threads_load_cleanup(); 3690 3691 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3692 g_free(rb->receivedmap); 3693 rb->receivedmap = NULL; 3694 } 3695 3696 return 0; 3697 } 3698 3699 /** 3700 * ram_postcopy_incoming_init: allocate postcopy data structures 3701 * 3702 * Returns 0 for success and negative if there was one error 3703 * 3704 * @mis: current migration incoming state 3705 * 3706 * Allocate data structures etc needed by incoming migration with 3707 * postcopy-ram. postcopy-ram's similarly names 3708 * postcopy_ram_incoming_init does the work. 3709 */ 3710 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3711 { 3712 return postcopy_ram_incoming_init(mis); 3713 } 3714 3715 /** 3716 * ram_load_postcopy: load a page in postcopy case 3717 * 3718 * Returns 0 for success or -errno in case of error 3719 * 3720 * Called in postcopy mode by ram_load(). 3721 * rcu_read_lock is taken prior to this being called. 3722 * 3723 * @f: QEMUFile where to send the data 3724 */ 3725 static int ram_load_postcopy(QEMUFile *f) 3726 { 3727 int flags = 0, ret = 0; 3728 bool place_needed = false; 3729 bool matches_target_page_size = false; 3730 MigrationIncomingState *mis = migration_incoming_get_current(); 3731 /* Temporary page that is later 'placed' */ 3732 void *postcopy_host_page = mis->postcopy_tmp_page; 3733 void *host_page = NULL; 3734 bool all_zero = true; 3735 int target_pages = 0; 3736 3737 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3738 ram_addr_t addr; 3739 void *page_buffer = NULL; 3740 void *place_source = NULL; 3741 RAMBlock *block = NULL; 3742 uint8_t ch; 3743 int len; 3744 3745 addr = qemu_get_be64(f); 3746 3747 /* 3748 * If qemu file error, we should stop here, and then "addr" 3749 * may be invalid 3750 */ 3751 ret = qemu_file_get_error(f); 3752 if (ret) { 3753 break; 3754 } 3755 3756 flags = addr & ~TARGET_PAGE_MASK; 3757 addr &= TARGET_PAGE_MASK; 3758 3759 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3760 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3761 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3762 block = ram_block_from_stream(f, flags); 3763 if (!block) { 3764 ret = -EINVAL; 3765 break; 3766 } 3767 3768 /* 3769 * Relying on used_length is racy and can result in false positives. 3770 * We might place pages beyond used_length in case RAM was shrunk 3771 * while in postcopy, which is fine - trying to place via 3772 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3773 */ 3774 if (!block->host || addr >= block->postcopy_length) { 3775 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3776 ret = -EINVAL; 3777 break; 3778 } 3779 target_pages++; 3780 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3781 /* 3782 * Postcopy requires that we place whole host pages atomically; 3783 * these may be huge pages for RAMBlocks that are backed by 3784 * hugetlbfs. 3785 * To make it atomic, the data is read into a temporary page 3786 * that's moved into place later. 3787 * The migration protocol uses, possibly smaller, target-pages 3788 * however the source ensures it always sends all the components 3789 * of a host page in one chunk. 3790 */ 3791 page_buffer = postcopy_host_page + 3792 host_page_offset_from_ram_block_offset(block, addr); 3793 /* If all TP are zero then we can optimise the place */ 3794 if (target_pages == 1) { 3795 host_page = host_page_from_ram_block_offset(block, addr); 3796 } else if (host_page != host_page_from_ram_block_offset(block, 3797 addr)) { 3798 /* not the 1st TP within the HP */ 3799 error_report("Non-same host page %p/%p", host_page, 3800 host_page_from_ram_block_offset(block, addr)); 3801 ret = -EINVAL; 3802 break; 3803 } 3804 3805 /* 3806 * If it's the last part of a host page then we place the host 3807 * page 3808 */ 3809 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3810 place_needed = true; 3811 } 3812 place_source = postcopy_host_page; 3813 } 3814 3815 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3816 case RAM_SAVE_FLAG_ZERO: 3817 ch = qemu_get_byte(f); 3818 /* 3819 * Can skip to set page_buffer when 3820 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3821 */ 3822 if (ch || !matches_target_page_size) { 3823 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3824 } 3825 if (ch) { 3826 all_zero = false; 3827 } 3828 break; 3829 3830 case RAM_SAVE_FLAG_PAGE: 3831 all_zero = false; 3832 if (!matches_target_page_size) { 3833 /* For huge pages, we always use temporary buffer */ 3834 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3835 } else { 3836 /* 3837 * For small pages that matches target page size, we 3838 * avoid the qemu_file copy. Instead we directly use 3839 * the buffer of QEMUFile to place the page. Note: we 3840 * cannot do any QEMUFile operation before using that 3841 * buffer to make sure the buffer is valid when 3842 * placing the page. 3843 */ 3844 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3845 TARGET_PAGE_SIZE); 3846 } 3847 break; 3848 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3849 all_zero = false; 3850 len = qemu_get_be32(f); 3851 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3852 error_report("Invalid compressed data length: %d", len); 3853 ret = -EINVAL; 3854 break; 3855 } 3856 decompress_data_with_multi_threads(f, page_buffer, len); 3857 break; 3858 3859 case RAM_SAVE_FLAG_EOS: 3860 /* normal exit */ 3861 multifd_recv_sync_main(); 3862 break; 3863 default: 3864 error_report("Unknown combination of migration flags: 0x%x" 3865 " (postcopy mode)", flags); 3866 ret = -EINVAL; 3867 break; 3868 } 3869 3870 /* Got the whole host page, wait for decompress before placing. */ 3871 if (place_needed) { 3872 ret |= wait_for_decompress_done(); 3873 } 3874 3875 /* Detect for any possible file errors */ 3876 if (!ret && qemu_file_get_error(f)) { 3877 ret = qemu_file_get_error(f); 3878 } 3879 3880 if (!ret && place_needed) { 3881 if (all_zero) { 3882 ret = postcopy_place_page_zero(mis, host_page, block); 3883 } else { 3884 ret = postcopy_place_page(mis, host_page, place_source, 3885 block); 3886 } 3887 place_needed = false; 3888 target_pages = 0; 3889 /* Assume we have a zero page until we detect something different */ 3890 all_zero = true; 3891 } 3892 } 3893 3894 return ret; 3895 } 3896 3897 static bool postcopy_is_advised(void) 3898 { 3899 PostcopyState ps = postcopy_state_get(); 3900 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3901 } 3902 3903 static bool postcopy_is_running(void) 3904 { 3905 PostcopyState ps = postcopy_state_get(); 3906 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3907 } 3908 3909 /* 3910 * Flush content of RAM cache into SVM's memory. 3911 * Only flush the pages that be dirtied by PVM or SVM or both. 3912 */ 3913 void colo_flush_ram_cache(void) 3914 { 3915 RAMBlock *block = NULL; 3916 void *dst_host; 3917 void *src_host; 3918 unsigned long offset = 0; 3919 3920 memory_global_dirty_log_sync(); 3921 qemu_mutex_lock(&ram_state->bitmap_mutex); 3922 WITH_RCU_READ_LOCK_GUARD() { 3923 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3924 ramblock_sync_dirty_bitmap(ram_state, block); 3925 } 3926 } 3927 3928 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3929 WITH_RCU_READ_LOCK_GUARD() { 3930 block = QLIST_FIRST_RCU(&ram_list.blocks); 3931 3932 while (block) { 3933 unsigned long num = 0; 3934 3935 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3936 if (!offset_in_ramblock(block, 3937 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3938 offset = 0; 3939 num = 0; 3940 block = QLIST_NEXT_RCU(block, next); 3941 } else { 3942 unsigned long i = 0; 3943 3944 for (i = 0; i < num; i++) { 3945 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3946 } 3947 dst_host = block->host 3948 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3949 src_host = block->colo_cache 3950 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3951 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3952 offset += num; 3953 } 3954 } 3955 } 3956 trace_colo_flush_ram_cache_end(); 3957 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3958 } 3959 3960 /** 3961 * ram_load_precopy: load pages in precopy case 3962 * 3963 * Returns 0 for success or -errno in case of error 3964 * 3965 * Called in precopy mode by ram_load(). 3966 * rcu_read_lock is taken prior to this being called. 3967 * 3968 * @f: QEMUFile where to send the data 3969 */ 3970 static int ram_load_precopy(QEMUFile *f) 3971 { 3972 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; 3973 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3974 bool postcopy_advised = postcopy_is_advised(); 3975 if (!migrate_use_compression()) { 3976 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3977 } 3978 3979 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3980 ram_addr_t addr, total_ram_bytes; 3981 void *host = NULL, *host_bak = NULL; 3982 uint8_t ch; 3983 3984 /* 3985 * Yield periodically to let main loop run, but an iteration of 3986 * the main loop is expensive, so do it each some iterations 3987 */ 3988 if ((i & 32767) == 0 && qemu_in_coroutine()) { 3989 aio_co_schedule(qemu_get_current_aio_context(), 3990 qemu_coroutine_self()); 3991 qemu_coroutine_yield(); 3992 } 3993 i++; 3994 3995 addr = qemu_get_be64(f); 3996 flags = addr & ~TARGET_PAGE_MASK; 3997 addr &= TARGET_PAGE_MASK; 3998 3999 if (flags & invalid_flags) { 4000 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 4001 error_report("Received an unexpected compressed page"); 4002 } 4003 4004 ret = -EINVAL; 4005 break; 4006 } 4007 4008 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4009 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 4010 RAMBlock *block = ram_block_from_stream(f, flags); 4011 4012 host = host_from_ram_block_offset(block, addr); 4013 /* 4014 * After going into COLO stage, we should not load the page 4015 * into SVM's memory directly, we put them into colo_cache firstly. 4016 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4017 * Previously, we copied all these memory in preparing stage of COLO 4018 * while we need to stop VM, which is a time-consuming process. 4019 * Here we optimize it by a trick, back-up every page while in 4020 * migration process while COLO is enabled, though it affects the 4021 * speed of the migration, but it obviously reduce the downtime of 4022 * back-up all SVM'S memory in COLO preparing stage. 4023 */ 4024 if (migration_incoming_colo_enabled()) { 4025 if (migration_incoming_in_colo_state()) { 4026 /* In COLO stage, put all pages into cache temporarily */ 4027 host = colo_cache_from_block_offset(block, addr, true); 4028 } else { 4029 /* 4030 * In migration stage but before COLO stage, 4031 * Put all pages into both cache and SVM's memory. 4032 */ 4033 host_bak = colo_cache_from_block_offset(block, addr, false); 4034 } 4035 } 4036 if (!host) { 4037 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4038 ret = -EINVAL; 4039 break; 4040 } 4041 if (!migration_incoming_in_colo_state()) { 4042 ramblock_recv_bitmap_set(block, host); 4043 } 4044 4045 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4046 } 4047 4048 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4049 case RAM_SAVE_FLAG_MEM_SIZE: 4050 /* Synchronize RAM block list */ 4051 total_ram_bytes = addr; 4052 while (!ret && total_ram_bytes) { 4053 RAMBlock *block; 4054 char id[256]; 4055 ram_addr_t length; 4056 4057 len = qemu_get_byte(f); 4058 qemu_get_buffer(f, (uint8_t *)id, len); 4059 id[len] = 0; 4060 length = qemu_get_be64(f); 4061 4062 block = qemu_ram_block_by_name(id); 4063 if (block && !qemu_ram_is_migratable(block)) { 4064 error_report("block %s should not be migrated !", id); 4065 ret = -EINVAL; 4066 } else if (block) { 4067 if (length != block->used_length) { 4068 Error *local_err = NULL; 4069 4070 ret = qemu_ram_resize(block, length, 4071 &local_err); 4072 if (local_err) { 4073 error_report_err(local_err); 4074 } 4075 } 4076 /* For postcopy we need to check hugepage sizes match */ 4077 if (postcopy_advised && migrate_postcopy_ram() && 4078 block->page_size != qemu_host_page_size) { 4079 uint64_t remote_page_size = qemu_get_be64(f); 4080 if (remote_page_size != block->page_size) { 4081 error_report("Mismatched RAM page size %s " 4082 "(local) %zd != %" PRId64, 4083 id, block->page_size, 4084 remote_page_size); 4085 ret = -EINVAL; 4086 } 4087 } 4088 if (migrate_ignore_shared()) { 4089 hwaddr addr = qemu_get_be64(f); 4090 if (ramblock_is_ignored(block) && 4091 block->mr->addr != addr) { 4092 error_report("Mismatched GPAs for block %s " 4093 "%" PRId64 "!= %" PRId64, 4094 id, (uint64_t)addr, 4095 (uint64_t)block->mr->addr); 4096 ret = -EINVAL; 4097 } 4098 } 4099 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 4100 block->idstr); 4101 } else { 4102 error_report("Unknown ramblock \"%s\", cannot " 4103 "accept migration", id); 4104 ret = -EINVAL; 4105 } 4106 4107 total_ram_bytes -= length; 4108 } 4109 break; 4110 4111 case RAM_SAVE_FLAG_ZERO: 4112 ch = qemu_get_byte(f); 4113 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 4114 break; 4115 4116 case RAM_SAVE_FLAG_PAGE: 4117 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4118 break; 4119 4120 case RAM_SAVE_FLAG_COMPRESS_PAGE: 4121 len = qemu_get_be32(f); 4122 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 4123 error_report("Invalid compressed data length: %d", len); 4124 ret = -EINVAL; 4125 break; 4126 } 4127 decompress_data_with_multi_threads(f, host, len); 4128 break; 4129 4130 case RAM_SAVE_FLAG_XBZRLE: 4131 if (load_xbzrle(f, addr, host) < 0) { 4132 error_report("Failed to decompress XBZRLE page at " 4133 RAM_ADDR_FMT, addr); 4134 ret = -EINVAL; 4135 break; 4136 } 4137 break; 4138 case RAM_SAVE_FLAG_EOS: 4139 /* normal exit */ 4140 multifd_recv_sync_main(); 4141 break; 4142 default: 4143 if (flags & RAM_SAVE_FLAG_HOOK) { 4144 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 4145 } else { 4146 error_report("Unknown combination of migration flags: 0x%x", 4147 flags); 4148 ret = -EINVAL; 4149 } 4150 } 4151 if (!ret) { 4152 ret = qemu_file_get_error(f); 4153 } 4154 if (!ret && host_bak) { 4155 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4156 } 4157 } 4158 4159 ret |= wait_for_decompress_done(); 4160 return ret; 4161 } 4162 4163 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4164 { 4165 int ret = 0; 4166 static uint64_t seq_iter; 4167 /* 4168 * If system is running in postcopy mode, page inserts to host memory must 4169 * be atomic 4170 */ 4171 bool postcopy_running = postcopy_is_running(); 4172 4173 seq_iter++; 4174 4175 if (version_id != 4) { 4176 return -EINVAL; 4177 } 4178 4179 /* 4180 * This RCU critical section can be very long running. 4181 * When RCU reclaims in the code start to become numerous, 4182 * it will be necessary to reduce the granularity of this 4183 * critical section. 4184 */ 4185 WITH_RCU_READ_LOCK_GUARD() { 4186 if (postcopy_running) { 4187 ret = ram_load_postcopy(f); 4188 } else { 4189 ret = ram_load_precopy(f); 4190 } 4191 } 4192 trace_ram_load_complete(ret, seq_iter); 4193 4194 return ret; 4195 } 4196 4197 static bool ram_has_postcopy(void *opaque) 4198 { 4199 RAMBlock *rb; 4200 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4201 if (ramblock_is_pmem(rb)) { 4202 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4203 "is not supported now!", rb->idstr, rb->host); 4204 return false; 4205 } 4206 } 4207 4208 return migrate_postcopy_ram(); 4209 } 4210 4211 /* Sync all the dirty bitmap with destination VM. */ 4212 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4213 { 4214 RAMBlock *block; 4215 QEMUFile *file = s->to_dst_file; 4216 int ramblock_count = 0; 4217 4218 trace_ram_dirty_bitmap_sync_start(); 4219 4220 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4221 qemu_savevm_send_recv_bitmap(file, block->idstr); 4222 trace_ram_dirty_bitmap_request(block->idstr); 4223 ramblock_count++; 4224 } 4225 4226 trace_ram_dirty_bitmap_sync_wait(); 4227 4228 /* Wait until all the ramblocks' dirty bitmap synced */ 4229 while (ramblock_count--) { 4230 qemu_sem_wait(&s->rp_state.rp_sem); 4231 } 4232 4233 trace_ram_dirty_bitmap_sync_complete(); 4234 4235 return 0; 4236 } 4237 4238 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 4239 { 4240 qemu_sem_post(&s->rp_state.rp_sem); 4241 } 4242 4243 /* 4244 * Read the received bitmap, revert it as the initial dirty bitmap. 4245 * This is only used when the postcopy migration is paused but wants 4246 * to resume from a middle point. 4247 */ 4248 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 4249 { 4250 int ret = -EINVAL; 4251 /* from_dst_file is always valid because we're within rp_thread */ 4252 QEMUFile *file = s->rp_state.from_dst_file; 4253 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 4254 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4255 uint64_t size, end_mark; 4256 4257 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4258 4259 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4260 error_report("%s: incorrect state %s", __func__, 4261 MigrationStatus_str(s->state)); 4262 return -EINVAL; 4263 } 4264 4265 /* 4266 * Note: see comments in ramblock_recv_bitmap_send() on why we 4267 * need the endianness conversion, and the paddings. 4268 */ 4269 local_size = ROUND_UP(local_size, 8); 4270 4271 /* Add paddings */ 4272 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4273 4274 size = qemu_get_be64(file); 4275 4276 /* The size of the bitmap should match with our ramblock */ 4277 if (size != local_size) { 4278 error_report("%s: ramblock '%s' bitmap size mismatch " 4279 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4280 block->idstr, size, local_size); 4281 ret = -EINVAL; 4282 goto out; 4283 } 4284 4285 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4286 end_mark = qemu_get_be64(file); 4287 4288 ret = qemu_file_get_error(file); 4289 if (ret || size != local_size) { 4290 error_report("%s: read bitmap failed for ramblock '%s': %d" 4291 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4292 __func__, block->idstr, ret, local_size, size); 4293 ret = -EIO; 4294 goto out; 4295 } 4296 4297 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4298 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4299 __func__, block->idstr, end_mark); 4300 ret = -EINVAL; 4301 goto out; 4302 } 4303 4304 /* 4305 * Endianness conversion. We are during postcopy (though paused). 4306 * The dirty bitmap won't change. We can directly modify it. 4307 */ 4308 bitmap_from_le(block->bmap, le_bitmap, nbits); 4309 4310 /* 4311 * What we received is "received bitmap". Revert it as the initial 4312 * dirty bitmap for this ramblock. 4313 */ 4314 bitmap_complement(block->bmap, block->bmap, nbits); 4315 4316 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4317 ramblock_dirty_bitmap_clear_discarded_pages(block); 4318 4319 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */ 4320 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4321 4322 /* 4323 * We succeeded to sync bitmap for current ramblock. If this is 4324 * the last one to sync, we need to notify the main send thread. 4325 */ 4326 ram_dirty_bitmap_reload_notify(s); 4327 4328 ret = 0; 4329 out: 4330 g_free(le_bitmap); 4331 return ret; 4332 } 4333 4334 static int ram_resume_prepare(MigrationState *s, void *opaque) 4335 { 4336 RAMState *rs = *(RAMState **)opaque; 4337 int ret; 4338 4339 ret = ram_dirty_bitmap_sync_all(s, rs); 4340 if (ret) { 4341 return ret; 4342 } 4343 4344 ram_state_resume_prepare(rs, s->to_dst_file); 4345 4346 return 0; 4347 } 4348 4349 static SaveVMHandlers savevm_ram_handlers = { 4350 .save_setup = ram_save_setup, 4351 .save_live_iterate = ram_save_iterate, 4352 .save_live_complete_postcopy = ram_save_complete, 4353 .save_live_complete_precopy = ram_save_complete, 4354 .has_postcopy = ram_has_postcopy, 4355 .save_live_pending = ram_save_pending, 4356 .load_state = ram_load, 4357 .save_cleanup = ram_save_cleanup, 4358 .load_setup = ram_load_setup, 4359 .load_cleanup = ram_load_cleanup, 4360 .resume_prepare = ram_resume_prepare, 4361 }; 4362 4363 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4364 size_t old_size, size_t new_size) 4365 { 4366 PostcopyState ps = postcopy_state_get(); 4367 ram_addr_t offset; 4368 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4369 Error *err = NULL; 4370 4371 if (ramblock_is_ignored(rb)) { 4372 return; 4373 } 4374 4375 if (!migration_is_idle()) { 4376 /* 4377 * Precopy code on the source cannot deal with the size of RAM blocks 4378 * changing at random points in time - especially after sending the 4379 * RAM block sizes in the migration stream, they must no longer change. 4380 * Abort and indicate a proper reason. 4381 */ 4382 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4383 migration_cancel(err); 4384 error_free(err); 4385 } 4386 4387 switch (ps) { 4388 case POSTCOPY_INCOMING_ADVISE: 4389 /* 4390 * Update what ram_postcopy_incoming_init()->init_range() does at the 4391 * time postcopy was advised. Syncing RAM blocks with the source will 4392 * result in RAM resizes. 4393 */ 4394 if (old_size < new_size) { 4395 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4396 error_report("RAM block '%s' discard of resized RAM failed", 4397 rb->idstr); 4398 } 4399 } 4400 rb->postcopy_length = new_size; 4401 break; 4402 case POSTCOPY_INCOMING_NONE: 4403 case POSTCOPY_INCOMING_RUNNING: 4404 case POSTCOPY_INCOMING_END: 4405 /* 4406 * Once our guest is running, postcopy does no longer care about 4407 * resizes. When growing, the new memory was not available on the 4408 * source, no handler needed. 4409 */ 4410 break; 4411 default: 4412 error_report("RAM block '%s' resized during postcopy state: %d", 4413 rb->idstr, ps); 4414 exit(-1); 4415 } 4416 } 4417 4418 static RAMBlockNotifier ram_mig_ram_notifier = { 4419 .ram_block_resized = ram_mig_ram_block_resized, 4420 }; 4421 4422 void ram_mig_init(void) 4423 { 4424 qemu_mutex_init(&XBZRLE.lock); 4425 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4426 ram_block_notifier_add(&ram_mig_ram_notifier); 4427 } 4428