/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
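/*
 * Take/release XBZRLE.lock around cache accesses.  The lock only
 * matters when XBZRLE is in use, so both helpers are no-ops otherwise.
 */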
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
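/*
 * Sentinel value appended after each receive bitmap on the wire so the
 * peer can notice a corrupted or truncated bitmap stream.
 */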
#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
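/*
 * do_data_compress: body of a compression worker thread.  Waits on
 * param->cond for a page queued by set_compress_params(), compresses it
 * into param->file, then marks the request done and wakes the migration
 * thread via comp_done_cond.
 */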
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
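/*
 * Example of the tail-slow step above: with the guest currently
 * throttled to 40% (cpu_now = 60) and dirtying memory twice as fast as
 * we can send it (bytes_dirty_threshold / bytes_dirty_period = 0.5),
 * cpu_ideal = 60 * 0.5 = 30, so the throttle is raised by
 * MIN(60 - 30, pct_increment) percentage points.
 */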
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    QEMU_LOCK_GUARD(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long. We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}
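/*
 * Example for the clear_bmap logic above: with clear_bmap_shift == 6,
 * one clear_bmap bit covers 64 target pages (256KB with 4KB target
 * pages), so a single memory_region_clear_dirty_bitmap() call services
 * the whole chunk before any page in it is sent.
 */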
/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
        / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
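/*
 * migration_trigger_throttle: called once per bitmap sync to check
 * whether the guest dirties memory faster than we transfer it; if that
 * happens twice, start or increase auto-converge throttling.
 */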
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
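/*
 * migration_bitmap_sync_precopy: bitmap sync wrapper used on the
 * precopy path; fires the PRECOPY_NOTIFY_{BEFORE,AFTER}_BITMAP_SYNC
 * notifiers around the actual sync.
 */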
static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}
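/*
 * ram_release_pages: when release-ram is enabled and we are in
 * postcopy, discard the given page range on the source so its memory
 * is freed as soon as the pages have been sent.
 */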
static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch errors during compression and decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
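/*
 * update_compress_thread_counts: account one completed compression
 * request; @bytes_xmit is what was flushed to the migration stream for
 * it (page header plus compressed data, or the zero-page marker).
 */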
static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
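/*
 * compress_page_with_multi_thread: hand the page at @offset in @block
 * to an idle compression thread.  Returns 1 if a thread took it, or -1
 * if none was free and compress-wait-thread is off, in which case the
 * caller sends the page uncompressed.
 */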
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}
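/*
 * UFFD write-protect based dirty tracking, used by the background
 * snapshot ('write-tracking') migration capability.  Only available on
 * Linux; the #else branch below provides stubs for other hosts.
 */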
#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *bs;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    bs = qemu_ram_block_from_host(page_address, false, offset);
    assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
    return bs;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *bs;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *bs;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, bs->host,
                bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, bs->host,
                bs->max_length, true, false)) {
            goto fail;
        }
        bs->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(bs->mr);

        trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
                bs->host, bs->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *bs;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);

        trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
                bs->host, bs->max_length);

        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vCPUs blocked by write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it's really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. In case there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is on, stop using the data compression after first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
1905 */ 1906 if (block != rs->last_sent_block) { 1907 flush_compressed_data(rs); 1908 return false; 1909 } 1910 1911 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1912 return true; 1913 } 1914 1915 compression_counters.busy++; 1916 return false; 1917 } 1918 1919 /** 1920 * ram_save_target_page: save one target page 1921 * 1922 * Returns the number of pages written 1923 * 1924 * @rs: current RAM state 1925 * @pss: data about the page we want to send 1926 * @last_stage: if we are at the completion stage 1927 */ 1928 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1929 bool last_stage) 1930 { 1931 RAMBlock *block = pss->block; 1932 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1933 int res; 1934 1935 if (control_save_page(rs, block, offset, &res)) { 1936 return res; 1937 } 1938 1939 if (save_compress_page(rs, block, offset)) { 1940 return 1; 1941 } 1942 1943 res = save_zero_page(rs, block, offset); 1944 if (res > 0) { 1945 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1946 * page would be stale 1947 */ 1948 if (!save_page_use_compression(rs)) { 1949 XBZRLE_cache_lock(); 1950 xbzrle_cache_zero_page(rs, block->offset + offset); 1951 XBZRLE_cache_unlock(); 1952 } 1953 ram_release_pages(block->idstr, offset, res); 1954 return res; 1955 } 1956 1957 /* 1958 * Do not use multifd for: 1959 * 1. Compression as the first page in the new block should be posted out 1960 * before sending the compressed page 1961 * 2. In postcopy as one whole host page should be placed 1962 */ 1963 if (!save_page_use_compression(rs) && migrate_use_multifd() 1964 && !migration_in_postcopy()) { 1965 return ram_save_multifd_page(rs, block, offset); 1966 } 1967 1968 return ram_save_page(rs, pss, last_stage); 1969 } 1970 1971 /** 1972 * ram_save_host_page: save a whole host page 1973 * 1974 * Starting at *offset send pages up to the end of the current host 1975 * page. It's valid for the initial offset to point into the middle of 1976 * a host page in which case the remainder of the hostpage is sent. 1977 * Only dirty target pages are sent. Note that the host page size may 1978 * be a huge page for this block. 1979 * The saving stops at the boundary of the used_length of the block 1980 * if the RAMBlock isn't a multiple of the host page size. 
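 * For example, with a RAMBlock backed by 2MiB hugepages and 4KiB target
 * pages, a single call can cover up to 512 target pages before it reaches
 * the host page boundary.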
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check if the page is dirty and, if it is, send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the last one we looked at */
    pss->page--;

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
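 * The search order is: queued page requests (e.g. from the postcopy
 * destination) first, via get_queued_page(), and otherwise a linear scan
 * of the dirty bitmap via find_dirty_block().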
2042 */ 2043 2044 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2045 { 2046 PageSearchStatus pss; 2047 int pages = 0; 2048 bool again, found; 2049 2050 /* No dirty page as there is zero RAM */ 2051 if (!ram_bytes_total()) { 2052 return pages; 2053 } 2054 2055 pss.block = rs->last_seen_block; 2056 pss.page = rs->last_page; 2057 pss.complete_round = false; 2058 2059 if (!pss.block) { 2060 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2061 } 2062 2063 do { 2064 again = true; 2065 found = get_queued_page(rs, &pss); 2066 2067 if (!found) { 2068 /* priority queue empty, so just search for something dirty */ 2069 found = find_dirty_block(rs, &pss, &again); 2070 } 2071 2072 if (found) { 2073 pages = ram_save_host_page(rs, &pss, last_stage); 2074 } 2075 } while (!pages && again); 2076 2077 rs->last_seen_block = pss.block; 2078 rs->last_page = pss.page; 2079 2080 return pages; 2081 } 2082 2083 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2084 { 2085 uint64_t pages = size / TARGET_PAGE_SIZE; 2086 2087 if (zero) { 2088 ram_counters.duplicate += pages; 2089 } else { 2090 ram_counters.normal += pages; 2091 ram_counters.transferred += size; 2092 qemu_update_position(f, size); 2093 } 2094 } 2095 2096 static uint64_t ram_bytes_total_common(bool count_ignored) 2097 { 2098 RAMBlock *block; 2099 uint64_t total = 0; 2100 2101 RCU_READ_LOCK_GUARD(); 2102 2103 if (count_ignored) { 2104 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2105 total += block->used_length; 2106 } 2107 } else { 2108 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2109 total += block->used_length; 2110 } 2111 } 2112 return total; 2113 } 2114 2115 uint64_t ram_bytes_total(void) 2116 { 2117 return ram_bytes_total_common(false); 2118 } 2119 2120 static void xbzrle_load_setup(void) 2121 { 2122 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2123 } 2124 2125 static void xbzrle_load_cleanup(void) 2126 { 2127 g_free(XBZRLE.decoded_buf); 2128 XBZRLE.decoded_buf = NULL; 2129 } 2130 2131 static void ram_state_cleanup(RAMState **rsp) 2132 { 2133 if (*rsp) { 2134 migration_page_queue_free(*rsp); 2135 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2136 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2137 g_free(*rsp); 2138 *rsp = NULL; 2139 } 2140 } 2141 2142 static void xbzrle_cleanup(void) 2143 { 2144 XBZRLE_cache_lock(); 2145 if (XBZRLE.cache) { 2146 cache_fini(XBZRLE.cache); 2147 g_free(XBZRLE.encoded_buf); 2148 g_free(XBZRLE.current_buf); 2149 g_free(XBZRLE.zero_target_page); 2150 XBZRLE.cache = NULL; 2151 XBZRLE.encoded_buf = NULL; 2152 XBZRLE.current_buf = NULL; 2153 XBZRLE.zero_target_page = NULL; 2154 } 2155 XBZRLE_cache_unlock(); 2156 } 2157 2158 static void ram_save_cleanup(void *opaque) 2159 { 2160 RAMState **rsp = opaque; 2161 RAMBlock *block; 2162 2163 /* We don't use dirty log with background snapshots */ 2164 if (!migrate_background_snapshot()) { 2165 /* caller have hold iothread lock or is in a bh, so there is 2166 * no writing race against the migration bitmap 2167 */ 2168 memory_global_dirty_log_stop(); 2169 } 2170 2171 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2172 g_free(block->clear_bmap); 2173 block->clear_bmap = NULL; 2174 g_free(block->bmap); 2175 block->bmap = NULL; 2176 } 2177 2178 xbzrle_cleanup(); 2179 compress_threads_save_cleanup(); 2180 ram_state_cleanup(rsp); 2181 } 2182 2183 static void ram_state_reset(RAMState *rs) 2184 { 2185 rs->last_seen_block = NULL; 2186 rs->last_sent_block = NULL; 2187 rs->last_page = 0; 2188 rs->last_version = ram_list.version; 2189 rs->ram_bulk_stage = true; 2190 rs->fpo_enabled 
= false; 2191 } 2192 2193 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2194 2195 /* 2196 * 'expected' is the value you expect the bitmap mostly to be full 2197 * of; it won't bother printing lines that are all this value. 2198 * If 'todump' is null the migration bitmap is dumped. 2199 */ 2200 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2201 unsigned long pages) 2202 { 2203 int64_t cur; 2204 int64_t linelen = 128; 2205 char linebuf[129]; 2206 2207 for (cur = 0; cur < pages; cur += linelen) { 2208 int64_t curb; 2209 bool found = false; 2210 /* 2211 * Last line; catch the case where the line length 2212 * is longer than remaining ram 2213 */ 2214 if (cur + linelen > pages) { 2215 linelen = pages - cur; 2216 } 2217 for (curb = 0; curb < linelen; curb++) { 2218 bool thisbit = test_bit(cur + curb, todump); 2219 linebuf[curb] = thisbit ? '1' : '.'; 2220 found = found || (thisbit != expected); 2221 } 2222 if (found) { 2223 linebuf[curb] = '\0'; 2224 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2225 } 2226 } 2227 } 2228 2229 /* **** functions for postcopy ***** */ 2230 2231 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2232 { 2233 struct RAMBlock *block; 2234 2235 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2236 unsigned long *bitmap = block->bmap; 2237 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2238 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2239 2240 while (run_start < range) { 2241 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2242 ram_discard_range(block->idstr, 2243 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2244 ((ram_addr_t)(run_end - run_start)) 2245 << TARGET_PAGE_BITS); 2246 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2247 } 2248 } 2249 } 2250 2251 /** 2252 * postcopy_send_discard_bm_ram: discard a RAMBlock 2253 * 2254 * Returns zero on success 2255 * 2256 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2257 * 2258 * @ms: current migration state 2259 * @block: RAMBlock to discard 2260 */ 2261 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2262 { 2263 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2264 unsigned long current; 2265 unsigned long *bitmap = block->bmap; 2266 2267 for (current = 0; current < end; ) { 2268 unsigned long one = find_next_bit(bitmap, end, current); 2269 unsigned long zero, discard_length; 2270 2271 if (one >= end) { 2272 break; 2273 } 2274 2275 zero = find_next_zero_bit(bitmap, end, one + 1); 2276 2277 if (zero >= end) { 2278 discard_length = end - one; 2279 } else { 2280 discard_length = zero - one; 2281 } 2282 postcopy_discard_send_range(ms, one, discard_length); 2283 current = one + discard_length; 2284 } 2285 2286 return 0; 2287 } 2288 2289 /** 2290 * postcopy_each_ram_send_discard: discard all RAMBlocks 2291 * 2292 * Returns 0 for success or negative for error 2293 * 2294 * Utility for the outgoing postcopy code. 2295 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2296 * passing it bitmap indexes and name. 
2297 * (qemu_ram_foreach_block ends up passing unscaled lengths 2298 * which would mean postcopy code would have to deal with target page) 2299 * 2300 * @ms: current migration state 2301 */ 2302 static int postcopy_each_ram_send_discard(MigrationState *ms) 2303 { 2304 struct RAMBlock *block; 2305 int ret; 2306 2307 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2308 postcopy_discard_send_init(ms, block->idstr); 2309 2310 /* 2311 * Postcopy sends chunks of bitmap over the wire, but it 2312 * just needs indexes at this point, avoids it having 2313 * target page specific code. 2314 */ 2315 ret = postcopy_send_discard_bm_ram(ms, block); 2316 postcopy_discard_send_finish(ms); 2317 if (ret) { 2318 return ret; 2319 } 2320 } 2321 2322 return 0; 2323 } 2324 2325 /** 2326 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2327 * 2328 * Helper for postcopy_chunk_hostpages; it's called twice to 2329 * canonicalize the two bitmaps, that are similar, but one is 2330 * inverted. 2331 * 2332 * Postcopy requires that all target pages in a hostpage are dirty or 2333 * clean, not a mix. This function canonicalizes the bitmaps. 2334 * 2335 * @ms: current migration state 2336 * @block: block that contains the page we want to canonicalize 2337 */ 2338 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2339 { 2340 RAMState *rs = ram_state; 2341 unsigned long *bitmap = block->bmap; 2342 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2343 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2344 unsigned long run_start; 2345 2346 if (block->page_size == TARGET_PAGE_SIZE) { 2347 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2348 return; 2349 } 2350 2351 /* Find a dirty page */ 2352 run_start = find_next_bit(bitmap, pages, 0); 2353 2354 while (run_start < pages) { 2355 2356 /* 2357 * If the start of this run of pages is in the middle of a host 2358 * page, then we need to fixup this host page. 2359 */ 2360 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2361 /* Find the end of this run */ 2362 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2363 /* 2364 * If the end isn't at the start of a host page, then the 2365 * run doesn't finish at the end of a host page 2366 * and we need to discard. 2367 */ 2368 } 2369 2370 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2371 unsigned long page; 2372 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2373 host_ratio); 2374 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2375 2376 /* Clean up the bitmap */ 2377 for (page = fixup_start_addr; 2378 page < fixup_start_addr + host_ratio; page++) { 2379 /* 2380 * Remark them as dirty, updating the count for any pages 2381 * that weren't previously dirty. 2382 */ 2383 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2384 } 2385 } 2386 2387 /* Find the next dirty page for the next iteration */ 2388 run_start = find_next_bit(bitmap, pages, run_start); 2389 } 2390 } 2391 2392 /** 2393 * postcopy_chunk_hostpages: discard any partially sent host page 2394 * 2395 * Utility for the outgoing postcopy code. 2396 * 2397 * Discard any partially sent host-page size chunks, mark any partially 2398 * dirty host-page size chunks as all dirty. In this case the host-page 2399 * is the host-page for the particular RAMBlock, i.e. 
it might be a huge page 2400 * 2401 * Returns zero on success 2402 * 2403 * @ms: current migration state 2404 * @block: block we want to work with 2405 */ 2406 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2407 { 2408 postcopy_discard_send_init(ms, block->idstr); 2409 2410 /* 2411 * Ensure that all partially dirty host pages are made fully dirty. 2412 */ 2413 postcopy_chunk_hostpages_pass(ms, block); 2414 2415 postcopy_discard_send_finish(ms); 2416 return 0; 2417 } 2418 2419 /** 2420 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2421 * 2422 * Returns zero on success 2423 * 2424 * Transmit the set of pages to be discarded after precopy to the target 2425 * these are pages that: 2426 * a) Have been previously transmitted but are now dirty again 2427 * b) Pages that have never been transmitted, this ensures that 2428 * any pages on the destination that have been mapped by background 2429 * tasks get discarded (transparent huge pages is the specific concern) 2430 * Hopefully this is pretty sparse 2431 * 2432 * @ms: current migration state 2433 */ 2434 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2435 { 2436 RAMState *rs = ram_state; 2437 RAMBlock *block; 2438 int ret; 2439 2440 RCU_READ_LOCK_GUARD(); 2441 2442 /* This should be our last sync, the src is now paused */ 2443 migration_bitmap_sync(rs); 2444 2445 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2446 rs->last_seen_block = NULL; 2447 rs->last_sent_block = NULL; 2448 rs->last_page = 0; 2449 2450 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2451 /* Deal with TPS != HPS and huge pages */ 2452 ret = postcopy_chunk_hostpages(ms, block); 2453 if (ret) { 2454 return ret; 2455 } 2456 2457 #ifdef DEBUG_POSTCOPY 2458 ram_debug_dump_bitmap(block->bmap, true, 2459 block->used_length >> TARGET_PAGE_BITS); 2460 #endif 2461 } 2462 trace_ram_postcopy_send_discard_bitmap(); 2463 2464 return postcopy_each_ram_send_discard(ms); 2465 } 2466 2467 /** 2468 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2469 * 2470 * Returns zero on success 2471 * 2472 * @rbname: name of the RAMBlock of the request. NULL means the 2473 * same that last one. 2474 * @start: RAMBlock starting page 2475 * @length: RAMBlock size 2476 */ 2477 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2478 { 2479 trace_ram_discard_range(rbname, start, length); 2480 2481 RCU_READ_LOCK_GUARD(); 2482 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2483 2484 if (!rb) { 2485 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2486 return -1; 2487 } 2488 2489 /* 2490 * On source VM, we don't need to update the received bitmap since 2491 * we don't even have one. 2492 */ 2493 if (rb->receivedmap) { 2494 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2495 length >> qemu_target_page_bits()); 2496 } 2497 2498 return ram_block_discard_range(rb, start, length); 2499 } 2500 2501 /* 2502 * For every allocation, we will try not to crash the VM if the 2503 * allocation failed. 
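 * That is why xbzrle_init() below uses the g_try_malloc()/g_try_malloc0()
 * variants (which return NULL on failure) instead of the aborting
 * g_malloc() ones, and unwinds any partial allocations before returning
 * -ENOMEM.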
2504 */ 2505 static int xbzrle_init(void) 2506 { 2507 Error *local_err = NULL; 2508 2509 if (!migrate_use_xbzrle()) { 2510 return 0; 2511 } 2512 2513 XBZRLE_cache_lock(); 2514 2515 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2516 if (!XBZRLE.zero_target_page) { 2517 error_report("%s: Error allocating zero page", __func__); 2518 goto err_out; 2519 } 2520 2521 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2522 TARGET_PAGE_SIZE, &local_err); 2523 if (!XBZRLE.cache) { 2524 error_report_err(local_err); 2525 goto free_zero_page; 2526 } 2527 2528 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2529 if (!XBZRLE.encoded_buf) { 2530 error_report("%s: Error allocating encoded_buf", __func__); 2531 goto free_cache; 2532 } 2533 2534 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2535 if (!XBZRLE.current_buf) { 2536 error_report("%s: Error allocating current_buf", __func__); 2537 goto free_encoded_buf; 2538 } 2539 2540 /* We are all good */ 2541 XBZRLE_cache_unlock(); 2542 return 0; 2543 2544 free_encoded_buf: 2545 g_free(XBZRLE.encoded_buf); 2546 XBZRLE.encoded_buf = NULL; 2547 free_cache: 2548 cache_fini(XBZRLE.cache); 2549 XBZRLE.cache = NULL; 2550 free_zero_page: 2551 g_free(XBZRLE.zero_target_page); 2552 XBZRLE.zero_target_page = NULL; 2553 err_out: 2554 XBZRLE_cache_unlock(); 2555 return -ENOMEM; 2556 } 2557 2558 static int ram_state_init(RAMState **rsp) 2559 { 2560 *rsp = g_try_new0(RAMState, 1); 2561 2562 if (!*rsp) { 2563 error_report("%s: Init ramstate fail", __func__); 2564 return -1; 2565 } 2566 2567 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2568 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2569 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2570 2571 /* 2572 * Count the total number of pages used by ram blocks not including any 2573 * gaps due to alignment or unplugs. 2574 * This must match with the initial values of dirty bitmap. 2575 */ 2576 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2577 ram_state_reset(*rsp); 2578 2579 return 0; 2580 } 2581 2582 static void ram_list_init_bitmaps(void) 2583 { 2584 MigrationState *ms = migrate_get_current(); 2585 RAMBlock *block; 2586 unsigned long pages; 2587 uint8_t shift; 2588 2589 /* Skip setting bitmap if there is no RAM */ 2590 if (ram_bytes_total()) { 2591 shift = ms->clear_bitmap_shift; 2592 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2593 error_report("clear_bitmap_shift (%u) too big, using " 2594 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2595 shift = CLEAR_BITMAP_SHIFT_MAX; 2596 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2597 error_report("clear_bitmap_shift (%u) too small, using " 2598 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2599 shift = CLEAR_BITMAP_SHIFT_MIN; 2600 } 2601 2602 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2603 pages = block->max_length >> TARGET_PAGE_BITS; 2604 /* 2605 * The initial dirty bitmap for migration must be set with all 2606 * ones to make sure we'll migrate every guest RAM page to 2607 * destination. 2608 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2609 * new migration after a failed migration, ram_list. 2610 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2611 * guest memory. 2612 */ 2613 block->bmap = bitmap_new(pages); 2614 bitmap_set(block->bmap, 0, pages); 2615 block->clear_bmap_shift = shift; 2616 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2617 } 2618 } 2619 } 2620 2621 static void ram_init_bitmaps(RAMState *rs) 2622 { 2623 /* For memory_global_dirty_log_start below. 
*/ 2624 qemu_mutex_lock_iothread(); 2625 qemu_mutex_lock_ramlist(); 2626 2627 WITH_RCU_READ_LOCK_GUARD() { 2628 ram_list_init_bitmaps(); 2629 /* We don't use dirty log with background snapshots */ 2630 if (!migrate_background_snapshot()) { 2631 memory_global_dirty_log_start(); 2632 migration_bitmap_sync_precopy(rs); 2633 } 2634 } 2635 qemu_mutex_unlock_ramlist(); 2636 qemu_mutex_unlock_iothread(); 2637 } 2638 2639 static int ram_init_all(RAMState **rsp) 2640 { 2641 if (ram_state_init(rsp)) { 2642 return -1; 2643 } 2644 2645 if (xbzrle_init()) { 2646 ram_state_cleanup(rsp); 2647 return -1; 2648 } 2649 2650 ram_init_bitmaps(*rsp); 2651 2652 return 0; 2653 } 2654 2655 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2656 { 2657 RAMBlock *block; 2658 uint64_t pages = 0; 2659 2660 /* 2661 * Postcopy is not using xbzrle/compression, so no need for that. 2662 * Also, since source are already halted, we don't need to care 2663 * about dirty page logging as well. 2664 */ 2665 2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2667 pages += bitmap_count_one(block->bmap, 2668 block->used_length >> TARGET_PAGE_BITS); 2669 } 2670 2671 /* This may not be aligned with current bitmaps. Recalculate. */ 2672 rs->migration_dirty_pages = pages; 2673 2674 rs->last_seen_block = NULL; 2675 rs->last_sent_block = NULL; 2676 rs->last_page = 0; 2677 rs->last_version = ram_list.version; 2678 /* 2679 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2680 * matter what we have sent. 2681 */ 2682 rs->ram_bulk_stage = false; 2683 2684 /* Update RAMState cache of output QEMUFile */ 2685 rs->f = out; 2686 2687 trace_ram_state_resume_prepare(pages); 2688 } 2689 2690 /* 2691 * This function clears bits of the free pages reported by the caller from the 2692 * migration dirty bitmap. @addr is the host address corresponding to the 2693 * start of the continuous guest free pages, and @len is the total bytes of 2694 * those pages. 2695 */ 2696 void qemu_guest_free_page_hint(void *addr, size_t len) 2697 { 2698 RAMBlock *block; 2699 ram_addr_t offset; 2700 size_t used_len, start, npages; 2701 MigrationState *s = migrate_get_current(); 2702 2703 /* This function is currently expected to be used during live migration */ 2704 if (!migration_is_setup_or_active(s->state)) { 2705 return; 2706 } 2707 2708 for (; len > 0; len -= used_len, addr += used_len) { 2709 block = qemu_ram_block_from_host(addr, false, &offset); 2710 if (unlikely(!block || offset >= block->used_length)) { 2711 /* 2712 * The implementation might not support RAMBlock resize during 2713 * live migration, but it could happen in theory with future 2714 * updates. So we add a check here to capture that case. 2715 */ 2716 error_report_once("%s unexpected error", __func__); 2717 return; 2718 } 2719 2720 if (len <= block->used_length - offset) { 2721 used_len = len; 2722 } else { 2723 used_len = block->used_length - offset; 2724 } 2725 2726 start = offset >> TARGET_PAGE_BITS; 2727 npages = used_len >> TARGET_PAGE_BITS; 2728 2729 qemu_mutex_lock(&ram_state->bitmap_mutex); 2730 ram_state->migration_dirty_pages -= 2731 bitmap_count_one_with_offset(block->bmap, start, npages); 2732 bitmap_clear(block->bmap, start, npages); 2733 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2734 } 2735 } 2736 2737 /* 2738 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2739 * long-running RCU critical section. When rcu-reclaims in the code 2740 * start to become numerous it will be necessary to reduce the 2741 * granularity of these critical sections. 
2742 */ 2743 2744 /** 2745 * ram_save_setup: Setup RAM for migration 2746 * 2747 * Returns zero to indicate success and negative for error 2748 * 2749 * @f: QEMUFile where to send the data 2750 * @opaque: RAMState pointer 2751 */ 2752 static int ram_save_setup(QEMUFile *f, void *opaque) 2753 { 2754 RAMState **rsp = opaque; 2755 RAMBlock *block; 2756 2757 if (compress_threads_save_setup()) { 2758 return -1; 2759 } 2760 2761 /* migration has already setup the bitmap, reuse it. */ 2762 if (!migration_in_colo_state()) { 2763 if (ram_init_all(rsp) != 0) { 2764 compress_threads_save_cleanup(); 2765 return -1; 2766 } 2767 } 2768 (*rsp)->f = f; 2769 2770 WITH_RCU_READ_LOCK_GUARD() { 2771 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2772 2773 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2774 qemu_put_byte(f, strlen(block->idstr)); 2775 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2776 qemu_put_be64(f, block->used_length); 2777 if (migrate_postcopy_ram() && block->page_size != 2778 qemu_host_page_size) { 2779 qemu_put_be64(f, block->page_size); 2780 } 2781 if (migrate_ignore_shared()) { 2782 qemu_put_be64(f, block->mr->addr); 2783 } 2784 } 2785 } 2786 2787 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2788 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2789 2790 multifd_send_sync_main(f); 2791 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2792 qemu_fflush(f); 2793 2794 return 0; 2795 } 2796 2797 /** 2798 * ram_save_iterate: iterative stage for migration 2799 * 2800 * Returns zero to indicate success and negative for error 2801 * 2802 * @f: QEMUFile where to send the data 2803 * @opaque: RAMState pointer 2804 */ 2805 static int ram_save_iterate(QEMUFile *f, void *opaque) 2806 { 2807 RAMState **temp = opaque; 2808 RAMState *rs = *temp; 2809 int ret = 0; 2810 int i; 2811 int64_t t0; 2812 int done = 0; 2813 2814 if (blk_mig_bulk_active()) { 2815 /* Avoid transferring ram during bulk phase of block migration as 2816 * the bulk phase will usually take a long time and transferring 2817 * ram updates during that time is pointless. */ 2818 goto out; 2819 } 2820 2821 WITH_RCU_READ_LOCK_GUARD() { 2822 if (ram_list.version != rs->last_version) { 2823 ram_state_reset(rs); 2824 } 2825 2826 /* Read version before ram_list.blocks */ 2827 smp_rmb(); 2828 2829 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2830 2831 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2832 i = 0; 2833 while ((ret = qemu_file_rate_limit(f)) == 0 || 2834 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2835 int pages; 2836 2837 if (qemu_file_get_error(f)) { 2838 break; 2839 } 2840 2841 pages = ram_find_and_save_block(rs, false); 2842 /* no more pages to sent */ 2843 if (pages == 0) { 2844 done = 1; 2845 break; 2846 } 2847 2848 if (pages < 0) { 2849 qemu_file_set_error(f, pages); 2850 break; 2851 } 2852 2853 rs->target_page_count += pages; 2854 2855 /* 2856 * During postcopy, it is necessary to make sure one whole host 2857 * page is sent in one chunk. 2858 */ 2859 if (migrate_postcopy_ram()) { 2860 flush_compressed_data(rs); 2861 } 2862 2863 /* 2864 * we want to check in the 1st loop, just in case it was the 1st 2865 * time and we had to sync the dirty bitmap. 
2866 * qemu_clock_get_ns() is a bit expensive, so we only check each 2867 * some iterations 2868 */ 2869 if ((i & 63) == 0) { 2870 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2871 1000000; 2872 if (t1 > MAX_WAIT) { 2873 trace_ram_save_iterate_big_wait(t1, i); 2874 break; 2875 } 2876 } 2877 i++; 2878 } 2879 } 2880 2881 /* 2882 * Must occur before EOS (or any QEMUFile operation) 2883 * because of RDMA protocol. 2884 */ 2885 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2886 2887 out: 2888 if (ret >= 0 2889 && migration_is_setup_or_active(migrate_get_current()->state)) { 2890 multifd_send_sync_main(rs->f); 2891 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2892 qemu_fflush(f); 2893 ram_counters.transferred += 8; 2894 2895 ret = qemu_file_get_error(f); 2896 } 2897 if (ret < 0) { 2898 return ret; 2899 } 2900 2901 return done; 2902 } 2903 2904 /** 2905 * ram_save_complete: function called to send the remaining amount of ram 2906 * 2907 * Returns zero to indicate success or negative on error 2908 * 2909 * Called with iothread lock 2910 * 2911 * @f: QEMUFile where to send the data 2912 * @opaque: RAMState pointer 2913 */ 2914 static int ram_save_complete(QEMUFile *f, void *opaque) 2915 { 2916 RAMState **temp = opaque; 2917 RAMState *rs = *temp; 2918 int ret = 0; 2919 2920 WITH_RCU_READ_LOCK_GUARD() { 2921 if (!migration_in_postcopy()) { 2922 migration_bitmap_sync_precopy(rs); 2923 } 2924 2925 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2926 2927 /* try transferring iterative blocks of memory */ 2928 2929 /* flush all remaining blocks regardless of rate limiting */ 2930 while (true) { 2931 int pages; 2932 2933 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2934 /* no more blocks to sent */ 2935 if (pages == 0) { 2936 break; 2937 } 2938 if (pages < 0) { 2939 ret = pages; 2940 break; 2941 } 2942 } 2943 2944 flush_compressed_data(rs); 2945 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2946 } 2947 2948 if (ret >= 0) { 2949 multifd_send_sync_main(rs->f); 2950 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2951 qemu_fflush(f); 2952 } 2953 2954 return ret; 2955 } 2956 2957 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2958 uint64_t *res_precopy_only, 2959 uint64_t *res_compatible, 2960 uint64_t *res_postcopy_only) 2961 { 2962 RAMState **temp = opaque; 2963 RAMState *rs = *temp; 2964 uint64_t remaining_size; 2965 2966 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2967 2968 if (!migration_in_postcopy() && 2969 remaining_size < max_size) { 2970 qemu_mutex_lock_iothread(); 2971 WITH_RCU_READ_LOCK_GUARD() { 2972 migration_bitmap_sync_precopy(rs); 2973 } 2974 qemu_mutex_unlock_iothread(); 2975 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2976 } 2977 2978 if (migrate_postcopy_ram()) { 2979 /* We can do postcopy, and all the data is postcopiable */ 2980 *res_compatible += remaining_size; 2981 } else { 2982 *res_precopy_only += remaining_size; 2983 } 2984 } 2985 2986 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2987 { 2988 unsigned int xh_len; 2989 int xh_flags; 2990 uint8_t *loaded_data; 2991 2992 /* extract RLE header */ 2993 xh_flags = qemu_get_byte(f); 2994 xh_len = qemu_get_be16(f); 2995 2996 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2997 error_report("Failed to load XBZRLE page - wrong compression!"); 2998 return -1; 2999 } 3000 3001 if (xh_len > TARGET_PAGE_SIZE) { 3002 error_report("Failed to load XBZRLE page - len overflow!"); 3003 return -1; 3004 } 3005 loaded_data = XBZRLE.decoded_buf; 3006 
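    /*
     * The stream layout handled here is: one flags byte (which must be
     * ENCODING_FLAG_XBZRLE, checked above), a big-endian 16-bit encoded
     * length, and then xh_len bytes of XBZRLE-encoded data that are
     * decoded against the current contents of @host below.
     */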
/* load data and decode */ 3007 /* it can change loaded_data to point to an internal buffer */ 3008 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3009 3010 /* decode RLE */ 3011 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3012 TARGET_PAGE_SIZE) == -1) { 3013 error_report("Failed to load XBZRLE page - decode error!"); 3014 return -1; 3015 } 3016 3017 return 0; 3018 } 3019 3020 /** 3021 * ram_block_from_stream: read a RAMBlock id from the migration stream 3022 * 3023 * Must be called from within a rcu critical section. 3024 * 3025 * Returns a pointer from within the RCU-protected ram_list. 3026 * 3027 * @f: QEMUFile where to read the data from 3028 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3029 */ 3030 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3031 { 3032 static RAMBlock *block; 3033 char id[256]; 3034 uint8_t len; 3035 3036 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3037 if (!block) { 3038 error_report("Ack, bad migration stream!"); 3039 return NULL; 3040 } 3041 return block; 3042 } 3043 3044 len = qemu_get_byte(f); 3045 qemu_get_buffer(f, (uint8_t *)id, len); 3046 id[len] = 0; 3047 3048 block = qemu_ram_block_by_name(id); 3049 if (!block) { 3050 error_report("Can't find block %s", id); 3051 return NULL; 3052 } 3053 3054 if (ramblock_is_ignored(block)) { 3055 error_report("block %s should not be migrated !", id); 3056 return NULL; 3057 } 3058 3059 return block; 3060 } 3061 3062 static inline void *host_from_ram_block_offset(RAMBlock *block, 3063 ram_addr_t offset) 3064 { 3065 if (!offset_in_ramblock(block, offset)) { 3066 return NULL; 3067 } 3068 3069 return block->host + offset; 3070 } 3071 3072 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3073 ram_addr_t offset, bool record_bitmap) 3074 { 3075 if (!offset_in_ramblock(block, offset)) { 3076 return NULL; 3077 } 3078 if (!block->colo_cache) { 3079 error_report("%s: colo_cache is NULL in block :%s", 3080 __func__, block->idstr); 3081 return NULL; 3082 } 3083 3084 /* 3085 * During colo checkpoint, we need bitmap of these migrated pages. 3086 * It help us to decide which pages in ram cache should be flushed 3087 * into VM's RAM later. 3088 */ 3089 if (record_bitmap && 3090 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3091 ram_state->migration_dirty_pages++; 3092 } 3093 return block->colo_cache + offset; 3094 } 3095 3096 /** 3097 * ram_handle_compressed: handle the zero page case 3098 * 3099 * If a page (or a whole RDMA chunk) has been 3100 * determined to be zero, then zap it. 3101 * 3102 * @host: host address for the zero page 3103 * @ch: what the page is filled from. 
We only support zero 3104 * @size: size of the zero page 3105 */ 3106 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3107 { 3108 if (ch != 0 || !is_zero_range(host, size)) { 3109 memset(host, ch, size); 3110 } 3111 } 3112 3113 /* return the size after decompression, or negative value on error */ 3114 static int 3115 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3116 const uint8_t *source, size_t source_len) 3117 { 3118 int err; 3119 3120 err = inflateReset(stream); 3121 if (err != Z_OK) { 3122 return -1; 3123 } 3124 3125 stream->avail_in = source_len; 3126 stream->next_in = (uint8_t *)source; 3127 stream->avail_out = dest_len; 3128 stream->next_out = dest; 3129 3130 err = inflate(stream, Z_NO_FLUSH); 3131 if (err != Z_STREAM_END) { 3132 return -1; 3133 } 3134 3135 return stream->total_out; 3136 } 3137 3138 static void *do_data_decompress(void *opaque) 3139 { 3140 DecompressParam *param = opaque; 3141 unsigned long pagesize; 3142 uint8_t *des; 3143 int len, ret; 3144 3145 qemu_mutex_lock(¶m->mutex); 3146 while (!param->quit) { 3147 if (param->des) { 3148 des = param->des; 3149 len = param->len; 3150 param->des = 0; 3151 qemu_mutex_unlock(¶m->mutex); 3152 3153 pagesize = TARGET_PAGE_SIZE; 3154 3155 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3156 param->compbuf, len); 3157 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3158 error_report("decompress data failed"); 3159 qemu_file_set_error(decomp_file, ret); 3160 } 3161 3162 qemu_mutex_lock(&decomp_done_lock); 3163 param->done = true; 3164 qemu_cond_signal(&decomp_done_cond); 3165 qemu_mutex_unlock(&decomp_done_lock); 3166 3167 qemu_mutex_lock(¶m->mutex); 3168 } else { 3169 qemu_cond_wait(¶m->cond, ¶m->mutex); 3170 } 3171 } 3172 qemu_mutex_unlock(¶m->mutex); 3173 3174 return NULL; 3175 } 3176 3177 static int wait_for_decompress_done(void) 3178 { 3179 int idx, thread_count; 3180 3181 if (!migrate_use_compression()) { 3182 return 0; 3183 } 3184 3185 thread_count = migrate_decompress_threads(); 3186 qemu_mutex_lock(&decomp_done_lock); 3187 for (idx = 0; idx < thread_count; idx++) { 3188 while (!decomp_param[idx].done) { 3189 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3190 } 3191 } 3192 qemu_mutex_unlock(&decomp_done_lock); 3193 return qemu_file_get_error(decomp_file); 3194 } 3195 3196 static void compress_threads_load_cleanup(void) 3197 { 3198 int i, thread_count; 3199 3200 if (!migrate_use_compression()) { 3201 return; 3202 } 3203 thread_count = migrate_decompress_threads(); 3204 for (i = 0; i < thread_count; i++) { 3205 /* 3206 * we use it as a indicator which shows if the thread is 3207 * properly init'd or not 3208 */ 3209 if (!decomp_param[i].compbuf) { 3210 break; 3211 } 3212 3213 qemu_mutex_lock(&decomp_param[i].mutex); 3214 decomp_param[i].quit = true; 3215 qemu_cond_signal(&decomp_param[i].cond); 3216 qemu_mutex_unlock(&decomp_param[i].mutex); 3217 } 3218 for (i = 0; i < thread_count; i++) { 3219 if (!decomp_param[i].compbuf) { 3220 break; 3221 } 3222 3223 qemu_thread_join(decompress_threads + i); 3224 qemu_mutex_destroy(&decomp_param[i].mutex); 3225 qemu_cond_destroy(&decomp_param[i].cond); 3226 inflateEnd(&decomp_param[i].stream); 3227 g_free(decomp_param[i].compbuf); 3228 decomp_param[i].compbuf = NULL; 3229 } 3230 g_free(decompress_threads); 3231 g_free(decomp_param); 3232 decompress_threads = NULL; 3233 decomp_param = NULL; 3234 decomp_file = NULL; 3235 } 3236 3237 static int compress_threads_load_setup(QEMUFile *f) 3238 { 3239 int i, thread_count; 
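    /*
     * One DecompressParam (z_stream, compressed-data buffer, mutex/cond)
     * is set up per decompress thread below; decomp_done_lock and
     * decomp_done_cond are shared and let waiters find an idle thread.
     */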
3240 3241 if (!migrate_use_compression()) { 3242 return 0; 3243 } 3244 3245 thread_count = migrate_decompress_threads(); 3246 decompress_threads = g_new0(QemuThread, thread_count); 3247 decomp_param = g_new0(DecompressParam, thread_count); 3248 qemu_mutex_init(&decomp_done_lock); 3249 qemu_cond_init(&decomp_done_cond); 3250 decomp_file = f; 3251 for (i = 0; i < thread_count; i++) { 3252 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3253 goto exit; 3254 } 3255 3256 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3257 qemu_mutex_init(&decomp_param[i].mutex); 3258 qemu_cond_init(&decomp_param[i].cond); 3259 decomp_param[i].done = true; 3260 decomp_param[i].quit = false; 3261 qemu_thread_create(decompress_threads + i, "decompress", 3262 do_data_decompress, decomp_param + i, 3263 QEMU_THREAD_JOINABLE); 3264 } 3265 return 0; 3266 exit: 3267 compress_threads_load_cleanup(); 3268 return -1; 3269 } 3270 3271 static void decompress_data_with_multi_threads(QEMUFile *f, 3272 void *host, int len) 3273 { 3274 int idx, thread_count; 3275 3276 thread_count = migrate_decompress_threads(); 3277 QEMU_LOCK_GUARD(&decomp_done_lock); 3278 while (true) { 3279 for (idx = 0; idx < thread_count; idx++) { 3280 if (decomp_param[idx].done) { 3281 decomp_param[idx].done = false; 3282 qemu_mutex_lock(&decomp_param[idx].mutex); 3283 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3284 decomp_param[idx].des = host; 3285 decomp_param[idx].len = len; 3286 qemu_cond_signal(&decomp_param[idx].cond); 3287 qemu_mutex_unlock(&decomp_param[idx].mutex); 3288 break; 3289 } 3290 } 3291 if (idx < thread_count) { 3292 break; 3293 } else { 3294 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3295 } 3296 } 3297 } 3298 3299 /* 3300 * we must set ram_bulk_stage to false, otherwise in 3301 * migation_bitmap_find_dirty the bitmap will be unused and 3302 * all the pages in ram cache wil be flushed to the ram of 3303 * secondary VM. 3304 */ 3305 static void colo_init_ram_state(void) 3306 { 3307 ram_state_init(&ram_state); 3308 ram_state->ram_bulk_stage = false; 3309 } 3310 3311 /* 3312 * colo cache: this is for secondary VM, we cache the whole 3313 * memory of the secondary VM, it is need to hold the global lock 3314 * to call this helper. 3315 */ 3316 int colo_init_ram_cache(void) 3317 { 3318 RAMBlock *block; 3319 3320 WITH_RCU_READ_LOCK_GUARD() { 3321 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3322 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3323 NULL, 3324 false); 3325 if (!block->colo_cache) { 3326 error_report("%s: Can't alloc memory for COLO cache of block %s," 3327 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3328 block->used_length); 3329 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3330 if (block->colo_cache) { 3331 qemu_anon_ram_free(block->colo_cache, block->used_length); 3332 block->colo_cache = NULL; 3333 } 3334 } 3335 return -errno; 3336 } 3337 } 3338 } 3339 3340 /* 3341 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3342 * with to decide which page in cache should be flushed into SVM's RAM. Here 3343 * we use the same name 'ram_bitmap' as for migration. 
3344 */ 3345 if (ram_bytes_total()) { 3346 RAMBlock *block; 3347 3348 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3349 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3350 block->bmap = bitmap_new(pages); 3351 } 3352 } 3353 3354 colo_init_ram_state(); 3355 return 0; 3356 } 3357 3358 /* TODO: duplicated with ram_init_bitmaps */ 3359 void colo_incoming_start_dirty_log(void) 3360 { 3361 RAMBlock *block = NULL; 3362 /* For memory_global_dirty_log_start below. */ 3363 qemu_mutex_lock_iothread(); 3364 qemu_mutex_lock_ramlist(); 3365 3366 memory_global_dirty_log_sync(); 3367 WITH_RCU_READ_LOCK_GUARD() { 3368 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3369 ramblock_sync_dirty_bitmap(ram_state, block); 3370 /* Discard this dirty bitmap record */ 3371 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3372 } 3373 memory_global_dirty_log_start(); 3374 } 3375 ram_state->migration_dirty_pages = 0; 3376 qemu_mutex_unlock_ramlist(); 3377 qemu_mutex_unlock_iothread(); 3378 } 3379 3380 /* It is need to hold the global lock to call this helper */ 3381 void colo_release_ram_cache(void) 3382 { 3383 RAMBlock *block; 3384 3385 memory_global_dirty_log_stop(); 3386 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3387 g_free(block->bmap); 3388 block->bmap = NULL; 3389 } 3390 3391 WITH_RCU_READ_LOCK_GUARD() { 3392 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3393 if (block->colo_cache) { 3394 qemu_anon_ram_free(block->colo_cache, block->used_length); 3395 block->colo_cache = NULL; 3396 } 3397 } 3398 } 3399 ram_state_cleanup(&ram_state); 3400 } 3401 3402 /** 3403 * ram_load_setup: Setup RAM for migration incoming side 3404 * 3405 * Returns zero to indicate success and negative for error 3406 * 3407 * @f: QEMUFile where to receive the data 3408 * @opaque: RAMState pointer 3409 */ 3410 static int ram_load_setup(QEMUFile *f, void *opaque) 3411 { 3412 if (compress_threads_load_setup(f)) { 3413 return -1; 3414 } 3415 3416 xbzrle_load_setup(); 3417 ramblock_recv_map_init(); 3418 3419 return 0; 3420 } 3421 3422 static int ram_load_cleanup(void *opaque) 3423 { 3424 RAMBlock *rb; 3425 3426 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3427 qemu_ram_block_writeback(rb); 3428 } 3429 3430 xbzrle_load_cleanup(); 3431 compress_threads_load_cleanup(); 3432 3433 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3434 g_free(rb->receivedmap); 3435 rb->receivedmap = NULL; 3436 } 3437 3438 return 0; 3439 } 3440 3441 /** 3442 * ram_postcopy_incoming_init: allocate postcopy data structures 3443 * 3444 * Returns 0 for success and negative if there was one error 3445 * 3446 * @mis: current migration incoming state 3447 * 3448 * Allocate data structures etc needed by incoming migration with 3449 * postcopy-ram. postcopy-ram's similarly names 3450 * postcopy_ram_incoming_init does the work. 3451 */ 3452 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3453 { 3454 return postcopy_ram_incoming_init(mis); 3455 } 3456 3457 /** 3458 * ram_load_postcopy: load a page in postcopy case 3459 * 3460 * Returns 0 for success or -errno in case of error 3461 * 3462 * Called in postcopy mode by ram_load(). 3463 * rcu_read_lock is taken prior to this being called. 
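 * Each target page is read into a temporary host-page sized buffer and
 * the whole host page is only placed into guest memory once its last
 * target page has arrived (see place_needed below).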
3464 * 3465 * @f: QEMUFile where to send the data 3466 */ 3467 static int ram_load_postcopy(QEMUFile *f) 3468 { 3469 int flags = 0, ret = 0; 3470 bool place_needed = false; 3471 bool matches_target_page_size = false; 3472 MigrationIncomingState *mis = migration_incoming_get_current(); 3473 /* Temporary page that is later 'placed' */ 3474 void *postcopy_host_page = mis->postcopy_tmp_page; 3475 void *this_host = NULL; 3476 bool all_zero = true; 3477 int target_pages = 0; 3478 3479 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3480 ram_addr_t addr; 3481 void *host = NULL; 3482 void *page_buffer = NULL; 3483 void *place_source = NULL; 3484 RAMBlock *block = NULL; 3485 uint8_t ch; 3486 int len; 3487 3488 addr = qemu_get_be64(f); 3489 3490 /* 3491 * If qemu file error, we should stop here, and then "addr" 3492 * may be invalid 3493 */ 3494 ret = qemu_file_get_error(f); 3495 if (ret) { 3496 break; 3497 } 3498 3499 flags = addr & ~TARGET_PAGE_MASK; 3500 addr &= TARGET_PAGE_MASK; 3501 3502 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3503 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3504 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3505 block = ram_block_from_stream(f, flags); 3506 3507 host = host_from_ram_block_offset(block, addr); 3508 if (!host) { 3509 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3510 ret = -EINVAL; 3511 break; 3512 } 3513 target_pages++; 3514 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3515 /* 3516 * Postcopy requires that we place whole host pages atomically; 3517 * these may be huge pages for RAMBlocks that are backed by 3518 * hugetlbfs. 3519 * To make it atomic, the data is read into a temporary page 3520 * that's moved into place later. 3521 * The migration protocol uses, possibly smaller, target-pages 3522 * however the source ensures it always sends all the components 3523 * of a host page in one chunk. 3524 */ 3525 page_buffer = postcopy_host_page + 3526 ((uintptr_t)host & (block->page_size - 1)); 3527 if (target_pages == 1) { 3528 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3529 block->page_size); 3530 } else { 3531 /* not the 1st TP within the HP */ 3532 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != 3533 (uintptr_t)this_host) { 3534 error_report("Non-same host page %p/%p", 3535 host, this_host); 3536 ret = -EINVAL; 3537 break; 3538 } 3539 } 3540 3541 /* 3542 * If it's the last part of a host page then we place the host 3543 * page 3544 */ 3545 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3546 place_needed = true; 3547 } 3548 place_source = postcopy_host_page; 3549 } 3550 3551 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3552 case RAM_SAVE_FLAG_ZERO: 3553 ch = qemu_get_byte(f); 3554 /* 3555 * Can skip to set page_buffer when 3556 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3557 */ 3558 if (ch || !matches_target_page_size) { 3559 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3560 } 3561 if (ch) { 3562 all_zero = false; 3563 } 3564 break; 3565 3566 case RAM_SAVE_FLAG_PAGE: 3567 all_zero = false; 3568 if (!matches_target_page_size) { 3569 /* For huge pages, we always use temporary buffer */ 3570 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3571 } else { 3572 /* 3573 * For small pages that matches target page size, we 3574 * avoid the qemu_file copy. Instead we directly use 3575 * the buffer of QEMUFile to place the page. Note: we 3576 * cannot do any QEMUFile operation before using that 3577 * buffer to make sure the buffer is valid when 3578 * placing the page. 
3579 */ 3580 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3581 TARGET_PAGE_SIZE); 3582 } 3583 break; 3584 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3585 all_zero = false; 3586 len = qemu_get_be32(f); 3587 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3588 error_report("Invalid compressed data length: %d", len); 3589 ret = -EINVAL; 3590 break; 3591 } 3592 decompress_data_with_multi_threads(f, page_buffer, len); 3593 break; 3594 3595 case RAM_SAVE_FLAG_EOS: 3596 /* normal exit */ 3597 multifd_recv_sync_main(); 3598 break; 3599 default: 3600 error_report("Unknown combination of migration flags: 0x%x" 3601 " (postcopy mode)", flags); 3602 ret = -EINVAL; 3603 break; 3604 } 3605 3606 /* Got the whole host page, wait for decompress before placing. */ 3607 if (place_needed) { 3608 ret |= wait_for_decompress_done(); 3609 } 3610 3611 /* Detect for any possible file errors */ 3612 if (!ret && qemu_file_get_error(f)) { 3613 ret = qemu_file_get_error(f); 3614 } 3615 3616 if (!ret && place_needed) { 3617 /* This gets called at the last target page in the host page */ 3618 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3619 block->page_size); 3620 3621 if (all_zero) { 3622 ret = postcopy_place_page_zero(mis, place_dest, 3623 block); 3624 } else { 3625 ret = postcopy_place_page(mis, place_dest, 3626 place_source, block); 3627 } 3628 place_needed = false; 3629 target_pages = 0; 3630 /* Assume we have a zero page until we detect something different */ 3631 all_zero = true; 3632 } 3633 } 3634 3635 return ret; 3636 } 3637 3638 static bool postcopy_is_advised(void) 3639 { 3640 PostcopyState ps = postcopy_state_get(); 3641 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3642 } 3643 3644 static bool postcopy_is_running(void) 3645 { 3646 PostcopyState ps = postcopy_state_get(); 3647 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3648 } 3649 3650 /* 3651 * Flush content of RAM cache into SVM's memory. 3652 * Only flush the pages that be dirtied by PVM or SVM or both. 3653 */ 3654 void colo_flush_ram_cache(void) 3655 { 3656 RAMBlock *block = NULL; 3657 void *dst_host; 3658 void *src_host; 3659 unsigned long offset = 0; 3660 3661 memory_global_dirty_log_sync(); 3662 WITH_RCU_READ_LOCK_GUARD() { 3663 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3664 ramblock_sync_dirty_bitmap(ram_state, block); 3665 } 3666 } 3667 3668 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3669 WITH_RCU_READ_LOCK_GUARD() { 3670 block = QLIST_FIRST_RCU(&ram_list.blocks); 3671 3672 while (block) { 3673 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3674 3675 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3676 >= block->used_length) { 3677 offset = 0; 3678 block = QLIST_NEXT_RCU(block, next); 3679 } else { 3680 migration_bitmap_clear_dirty(ram_state, block, offset); 3681 dst_host = block->host 3682 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3683 src_host = block->colo_cache 3684 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3685 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3686 } 3687 } 3688 } 3689 trace_colo_flush_ram_cache_end(); 3690 } 3691 3692 /** 3693 * ram_load_precopy: load pages in precopy case 3694 * 3695 * Returns 0 for success or -errno in case of error 3696 * 3697 * Called in precopy mode by ram_load(). 3698 * rcu_read_lock is taken prior to this being called. 
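 * Unlike ram_load_postcopy(), pages can be written straight into guest
 * memory here (or into the COLO cache when a COLO checkpoint is active).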
 *
 * @f: QEMUFile where to read the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly, we put it into colo_cache first.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all this memory in the preparing stage
             * of COLO while the VM had to be stopped, which is a
             * time-consuming process.  Here we optimize it by backing up
             * every page during the migration process while COLO is
             * enabled; this slows the migration down a little, but it
             * clearly reduces the downtime of backing up all of SVM's
             * memory in the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
3764 */ 3765 host_bak = colo_cache_from_block_offset(block, addr, false); 3766 } 3767 } 3768 if (!host) { 3769 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3770 ret = -EINVAL; 3771 break; 3772 } 3773 if (!migration_incoming_in_colo_state()) { 3774 ramblock_recv_bitmap_set(block, host); 3775 } 3776 3777 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3778 } 3779 3780 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3781 case RAM_SAVE_FLAG_MEM_SIZE: 3782 /* Synchronize RAM block list */ 3783 total_ram_bytes = addr; 3784 while (!ret && total_ram_bytes) { 3785 RAMBlock *block; 3786 char id[256]; 3787 ram_addr_t length; 3788 3789 len = qemu_get_byte(f); 3790 qemu_get_buffer(f, (uint8_t *)id, len); 3791 id[len] = 0; 3792 length = qemu_get_be64(f); 3793 3794 block = qemu_ram_block_by_name(id); 3795 if (block && !qemu_ram_is_migratable(block)) { 3796 error_report("block %s should not be migrated !", id); 3797 ret = -EINVAL; 3798 } else if (block) { 3799 if (length != block->used_length) { 3800 Error *local_err = NULL; 3801 3802 ret = qemu_ram_resize(block, length, 3803 &local_err); 3804 if (local_err) { 3805 error_report_err(local_err); 3806 } 3807 } 3808 /* For postcopy we need to check hugepage sizes match */ 3809 if (postcopy_advised && migrate_postcopy_ram() && 3810 block->page_size != qemu_host_page_size) { 3811 uint64_t remote_page_size = qemu_get_be64(f); 3812 if (remote_page_size != block->page_size) { 3813 error_report("Mismatched RAM page size %s " 3814 "(local) %zd != %" PRId64, 3815 id, block->page_size, 3816 remote_page_size); 3817 ret = -EINVAL; 3818 } 3819 } 3820 if (migrate_ignore_shared()) { 3821 hwaddr addr = qemu_get_be64(f); 3822 if (ramblock_is_ignored(block) && 3823 block->mr->addr != addr) { 3824 error_report("Mismatched GPAs for block %s " 3825 "%" PRId64 "!= %" PRId64, 3826 id, (uint64_t)addr, 3827 (uint64_t)block->mr->addr); 3828 ret = -EINVAL; 3829 } 3830 } 3831 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3832 block->idstr); 3833 } else { 3834 error_report("Unknown ramblock \"%s\", cannot " 3835 "accept migration", id); 3836 ret = -EINVAL; 3837 } 3838 3839 total_ram_bytes -= length; 3840 } 3841 break; 3842 3843 case RAM_SAVE_FLAG_ZERO: 3844 ch = qemu_get_byte(f); 3845 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3846 break; 3847 3848 case RAM_SAVE_FLAG_PAGE: 3849 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3850 break; 3851 3852 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3853 len = qemu_get_be32(f); 3854 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3855 error_report("Invalid compressed data length: %d", len); 3856 ret = -EINVAL; 3857 break; 3858 } 3859 decompress_data_with_multi_threads(f, host, len); 3860 break; 3861 3862 case RAM_SAVE_FLAG_XBZRLE: 3863 if (load_xbzrle(f, addr, host) < 0) { 3864 error_report("Failed to decompress XBZRLE page at " 3865 RAM_ADDR_FMT, addr); 3866 ret = -EINVAL; 3867 break; 3868 } 3869 break; 3870 case RAM_SAVE_FLAG_EOS: 3871 /* normal exit */ 3872 multifd_recv_sync_main(); 3873 break; 3874 default: 3875 if (flags & RAM_SAVE_FLAG_HOOK) { 3876 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3877 } else { 3878 error_report("Unknown combination of migration flags: 0x%x", 3879 flags); 3880 ret = -EINVAL; 3881 } 3882 } 3883 if (!ret) { 3884 ret = qemu_file_get_error(f); 3885 } 3886 if (!ret && host_bak) { 3887 memcpy(host_bak, host, TARGET_PAGE_SIZE); 3888 } 3889 } 3890 3891 ret |= wait_for_decompress_done(); 3892 return ret; 3893 } 3894 3895 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3896 { 
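    /*
     * Incoming RAM stream entry point: after checking the stream version
     * below, this dispatches to ram_load_postcopy() when postcopy is
     * already running on the destination, and to ram_load_precopy()
     * otherwise.
     */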
3897 int ret = 0; 3898 static uint64_t seq_iter; 3899 /* 3900 * If system is running in postcopy mode, page inserts to host memory must 3901 * be atomic 3902 */ 3903 bool postcopy_running = postcopy_is_running(); 3904 3905 seq_iter++; 3906 3907 if (version_id != 4) { 3908 return -EINVAL; 3909 } 3910 3911 /* 3912 * This RCU critical section can be very long running. 3913 * When RCU reclaims in the code start to become numerous, 3914 * it will be necessary to reduce the granularity of this 3915 * critical section. 3916 */ 3917 WITH_RCU_READ_LOCK_GUARD() { 3918 if (postcopy_running) { 3919 ret = ram_load_postcopy(f); 3920 } else { 3921 ret = ram_load_precopy(f); 3922 } 3923 } 3924 trace_ram_load_complete(ret, seq_iter); 3925 3926 return ret; 3927 } 3928 3929 static bool ram_has_postcopy(void *opaque) 3930 { 3931 RAMBlock *rb; 3932 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3933 if (ramblock_is_pmem(rb)) { 3934 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 3935 "is not supported now!", rb->idstr, rb->host); 3936 return false; 3937 } 3938 } 3939 3940 return migrate_postcopy_ram(); 3941 } 3942 3943 /* Sync all the dirty bitmap with destination VM. */ 3944 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3945 { 3946 RAMBlock *block; 3947 QEMUFile *file = s->to_dst_file; 3948 int ramblock_count = 0; 3949 3950 trace_ram_dirty_bitmap_sync_start(); 3951 3952 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3953 qemu_savevm_send_recv_bitmap(file, block->idstr); 3954 trace_ram_dirty_bitmap_request(block->idstr); 3955 ramblock_count++; 3956 } 3957 3958 trace_ram_dirty_bitmap_sync_wait(); 3959 3960 /* Wait until all the ramblocks' dirty bitmap synced */ 3961 while (ramblock_count--) { 3962 qemu_sem_wait(&s->rp_state.rp_sem); 3963 } 3964 3965 trace_ram_dirty_bitmap_sync_complete(); 3966 3967 return 0; 3968 } 3969 3970 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3971 { 3972 qemu_sem_post(&s->rp_state.rp_sem); 3973 } 3974 3975 /* 3976 * Read the received bitmap, revert it as the initial dirty bitmap. 3977 * This is only used when the postcopy migration is paused but wants 3978 * to resume from a middle point. 3979 */ 3980 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3981 { 3982 int ret = -EINVAL; 3983 QEMUFile *file = s->rp_state.from_dst_file; 3984 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3985 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 3986 uint64_t size, end_mark; 3987 3988 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3989 3990 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3991 error_report("%s: incorrect state %s", __func__, 3992 MigrationStatus_str(s->state)); 3993 return -EINVAL; 3994 } 3995 3996 /* 3997 * Note: see comments in ramblock_recv_bitmap_send() on why we 3998 * need the endianness conversion, and the paddings. 
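     * The layout read back here is: an 8-byte big-endian size, then that
     * many bytes of little-endian bitmap (the size is rounded up to a
     * multiple of 8 bytes), and finally the 8-byte
     * RAMBLOCK_RECV_BITMAP_ENDING marker.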
3999 */ 4000 local_size = ROUND_UP(local_size, 8); 4001 4002 /* Add paddings */ 4003 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4004 4005 size = qemu_get_be64(file); 4006 4007 /* The size of the bitmap should match with our ramblock */ 4008 if (size != local_size) { 4009 error_report("%s: ramblock '%s' bitmap size mismatch " 4010 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 4011 block->idstr, size, local_size); 4012 ret = -EINVAL; 4013 goto out; 4014 } 4015 4016 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4017 end_mark = qemu_get_be64(file); 4018 4019 ret = qemu_file_get_error(file); 4020 if (ret || size != local_size) { 4021 error_report("%s: read bitmap failed for ramblock '%s': %d" 4022 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 4023 __func__, block->idstr, ret, local_size, size); 4024 ret = -EIO; 4025 goto out; 4026 } 4027 4028 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4029 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 4030 __func__, block->idstr, end_mark); 4031 ret = -EINVAL; 4032 goto out; 4033 } 4034 4035 /* 4036 * Endianness conversion. We are during postcopy (though paused). 4037 * The dirty bitmap won't change. We can directly modify it. 4038 */ 4039 bitmap_from_le(block->bmap, le_bitmap, nbits); 4040 4041 /* 4042 * What we received is "received bitmap". Revert it as the initial 4043 * dirty bitmap for this ramblock. 4044 */ 4045 bitmap_complement(block->bmap, block->bmap, nbits); 4046 4047 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4048 4049 /* 4050 * We succeeded to sync bitmap for current ramblock. If this is 4051 * the last one to sync, we need to notify the main send thread. 4052 */ 4053 ram_dirty_bitmap_reload_notify(s); 4054 4055 ret = 0; 4056 out: 4057 g_free(le_bitmap); 4058 return ret; 4059 } 4060 4061 static int ram_resume_prepare(MigrationState *s, void *opaque) 4062 { 4063 RAMState *rs = *(RAMState **)opaque; 4064 int ret; 4065 4066 ret = ram_dirty_bitmap_sync_all(s, rs); 4067 if (ret) { 4068 return ret; 4069 } 4070 4071 ram_state_resume_prepare(rs, s->to_dst_file); 4072 4073 return 0; 4074 } 4075 4076 static SaveVMHandlers savevm_ram_handlers = { 4077 .save_setup = ram_save_setup, 4078 .save_live_iterate = ram_save_iterate, 4079 .save_live_complete_postcopy = ram_save_complete, 4080 .save_live_complete_precopy = ram_save_complete, 4081 .has_postcopy = ram_has_postcopy, 4082 .save_live_pending = ram_save_pending, 4083 .load_state = ram_load, 4084 .save_cleanup = ram_save_cleanup, 4085 .load_setup = ram_load_setup, 4086 .load_cleanup = ram_load_cleanup, 4087 .resume_prepare = ram_resume_prepare, 4088 }; 4089 4090 void ram_mig_init(void) 4091 { 4092 qemu_mutex_init(&XBZRLE.lock); 4093 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4094 } 4095