1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "cpu.h" 31 #include "qemu/cutils.h" 32 #include "qemu/bitops.h" 33 #include "qemu/bitmap.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram.h" 37 #include "migration.h" 38 #include "migration/register.h" 39 #include "migration/misc.h" 40 #include "qemu-file.h" 41 #include "postcopy-ram.h" 42 #include "page_cache.h" 43 #include "qemu/error-report.h" 44 #include "qapi/error.h" 45 #include "qapi/qapi-types-migration.h" 46 #include "qapi/qapi-events-migration.h" 47 #include "qapi/qmp/qerror.h" 48 #include "trace.h" 49 #include "exec/ram_addr.h" 50 #include "exec/target_page.h" 51 #include "qemu/rcu_queue.h" 52 #include "migration/colo.h" 53 #include "block.h" 54 #include "sysemu/sysemu.h" 55 #include "sysemu/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "sysemu/runstate.h" 60 61 #if defined(__linux__) 62 #include "qemu/userfaultfd.h" 63 #endif /* defined(__linux__) */ 64 65 /***********************************************************/ 66 /* ram save/restore */ 67 68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it 69 * worked for pages that where filled with the same char. We switched 70 * it to only search for the zero value. And to avoid confusion with 71 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it. 72 */ 73 74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 75 #define RAM_SAVE_FLAG_ZERO 0x02 76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 77 #define RAM_SAVE_FLAG_PAGE 0x08 78 #define RAM_SAVE_FLAG_EOS 0x10 79 #define RAM_SAVE_FLAG_CONTINUE 0x20 80 #define RAM_SAVE_FLAG_XBZRLE 0x40 81 /* 0x80 is reserved in migration.h start with 0x100 next */ 82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 83 84 static inline bool is_zero_range(uint8_t *p, uint64_t size) 85 { 86 return buffer_is_zero(p, size); 87 } 88 89 XBZRLECacheStats xbzrle_counters; 90 91 /* struct contains XBZRLE cache and a static page 92 used by the compression */ 93 static struct { 94 /* buffer used for XBZRLE encoding */ 95 uint8_t *encoded_buf; 96 /* buffer for storing page content */ 97 uint8_t *current_buf; 98 /* Cache for XBZRLE, Protected by lock. 
*/ 99 PageCache *cache; 100 QemuMutex lock; 101 /* it will store a page full of zeros */ 102 uint8_t *zero_target_page; 103 /* buffer used for XBZRLE decoding */ 104 uint8_t *decoded_buf; 105 } XBZRLE; 106 107 static void XBZRLE_cache_lock(void) 108 { 109 if (migrate_use_xbzrle()) { 110 qemu_mutex_lock(&XBZRLE.lock); 111 } 112 } 113 114 static void XBZRLE_cache_unlock(void) 115 { 116 if (migrate_use_xbzrle()) { 117 qemu_mutex_unlock(&XBZRLE.lock); 118 } 119 } 120 121 /** 122 * xbzrle_cache_resize: resize the xbzrle cache 123 * 124 * This function is called from qmp_migrate_set_cache_size in main 125 * thread, possibly while a migration is in progress. A running 126 * migration may be using the cache and might finish during this call, 127 * hence changes to the cache are protected by XBZRLE.lock(). 128 * 129 * Returns 0 for success or -1 for error 130 * 131 * @new_size: new cache size 132 * @errp: set *errp if the check failed, with reason 133 */ 134 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 135 { 136 PageCache *new_cache; 137 int64_t ret = 0; 138 139 /* Check for truncation */ 140 if (new_size != (size_t)new_size) { 141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 142 "exceeding address space"); 143 return -1; 144 } 145 146 if (new_size == migrate_xbzrle_cache_size()) { 147 /* nothing to do */ 148 return 0; 149 } 150 151 XBZRLE_cache_lock(); 152 153 if (XBZRLE.cache != NULL) { 154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 155 if (!new_cache) { 156 ret = -1; 157 goto out; 158 } 159 160 cache_fini(XBZRLE.cache); 161 XBZRLE.cache = new_cache; 162 } 163 out: 164 XBZRLE_cache_unlock(); 165 return ret; 166 } 167 168 bool ramblock_is_ignored(RAMBlock *block) 169 { 170 return !qemu_ram_is_migratable(block) || 171 (migrate_ignore_shared() && qemu_ram_is_shared(block)); 172 } 173 174 #undef RAMBLOCK_FOREACH 175 176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 177 { 178 RAMBlock *block; 179 int ret = 0; 180 181 RCU_READ_LOCK_GUARD(); 182 183 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 184 ret = func(block, opaque); 185 if (ret) { 186 break; 187 } 188 } 189 return ret; 190 } 191 192 static void ramblock_recv_map_init(void) 193 { 194 RAMBlock *rb; 195 196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 197 assert(!rb->receivedmap); 198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 199 } 200 } 201 202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 203 { 204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 205 rb->receivedmap); 206 } 207 208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 209 { 210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 211 } 212 213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 214 { 215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 216 } 217 218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 219 size_t nr) 220 { 221 bitmap_set_atomic(rb->receivedmap, 222 ramblock_recv_bitmap_offset(host_addr, rb), 223 nr); 224 } 225 226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 227 228 /* 229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 230 * 231 * Returns >0 if success with sent bytes, or <0 if error. 
232 */ 233 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 234 const char *block_name) 235 { 236 RAMBlock *block = qemu_ram_block_by_name(block_name); 237 unsigned long *le_bitmap, nbits; 238 uint64_t size; 239 240 if (!block) { 241 error_report("%s: invalid block name: %s", __func__, block_name); 242 return -1; 243 } 244 245 nbits = block->used_length >> TARGET_PAGE_BITS; 246 247 /* 248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 249 * machines we may need 4 more bytes for padding (see below 250 * comment). So extend it a bit before hand. 251 */ 252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 253 254 /* 255 * Always use little endian when sending the bitmap. This is 256 * required that when source and destination VMs are not using the 257 * same endianness. (Note: big endian won't work.) 258 */ 259 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 260 261 /* Size of the bitmap, in bytes */ 262 size = DIV_ROUND_UP(nbits, 8); 263 264 /* 265 * size is always aligned to 8 bytes for 64bit machines, but it 266 * may not be true for 32bit machines. We need this padding to 267 * make sure the migration can survive even between 32bit and 268 * 64bit machines. 269 */ 270 size = ROUND_UP(size, 8); 271 272 qemu_put_be64(file, size); 273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 274 /* 275 * Mark as an end, in case the middle part is screwed up due to 276 * some "mysterious" reason. 277 */ 278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 279 qemu_fflush(file); 280 281 g_free(le_bitmap); 282 283 if (qemu_file_get_error(file)) { 284 return qemu_file_get_error(file); 285 } 286 287 return size + sizeof(size); 288 } 289 290 /* 291 * An outstanding page request, on the source, having been received 292 * and queued 293 */ 294 struct RAMSrcPageRequest { 295 RAMBlock *rb; 296 hwaddr offset; 297 hwaddr len; 298 299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 300 }; 301 302 /* State of RAM for migration */ 303 struct RAMState { 304 /* QEMUFile used for this migration */ 305 QEMUFile *f; 306 /* UFFD file descriptor, used in 'write-tracking' migration */ 307 int uffdio_fd; 308 /* Last block that we have visited searching for dirty pages */ 309 RAMBlock *last_seen_block; 310 /* Last block from where we have sent data */ 311 RAMBlock *last_sent_block; 312 /* Last dirty target page we have sent */ 313 ram_addr_t last_page; 314 /* last ram version we have seen */ 315 uint32_t last_version; 316 /* We are in the first round */ 317 bool ram_bulk_stage; 318 /* The free page optimization is enabled */ 319 bool fpo_enabled; 320 /* How many times we have dirty too many pages */ 321 int dirty_rate_high_cnt; 322 /* these variables are used for bitmap sync */ 323 /* last time we did a full bitmap_sync */ 324 int64_t time_last_bitmap_sync; 325 /* bytes transferred at start_time */ 326 uint64_t bytes_xfer_prev; 327 /* number of dirty pages since start_time */ 328 uint64_t num_dirty_pages_period; 329 /* xbzrle misses since the beginning of the period */ 330 uint64_t xbzrle_cache_miss_prev; 331 /* Amount of xbzrle pages since the beginning of the period */ 332 uint64_t xbzrle_pages_prev; 333 /* Amount of xbzrle encoded bytes since the beginning of the period */ 334 uint64_t xbzrle_bytes_prev; 335 336 /* compression statistics since the beginning of the period */ 337 /* amount of count that no free thread to compress data */ 338 uint64_t compress_thread_busy_prev; 339 /* amount bytes after compression */ 340 uint64_t compressed_size_prev; 341 /* amount of compressed pages */ 342 
uint64_t compress_pages_prev; 343 344 /* total handled target pages at the beginning of period */ 345 uint64_t target_page_count_prev; 346 /* total handled target pages since start */ 347 uint64_t target_page_count; 348 /* number of dirty bits in the bitmap */ 349 uint64_t migration_dirty_pages; 350 /* Protects modification of the bitmap and migration dirty pages */ 351 QemuMutex bitmap_mutex; 352 /* The RAMBlock used in the last src_page_requests */ 353 RAMBlock *last_req_rb; 354 /* Queue of outstanding page requests from the destination */ 355 QemuMutex src_page_req_mutex; 356 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 357 }; 358 typedef struct RAMState RAMState; 359 360 static RAMState *ram_state; 361 362 static NotifierWithReturnList precopy_notifier_list; 363 364 void precopy_infrastructure_init(void) 365 { 366 notifier_with_return_list_init(&precopy_notifier_list); 367 } 368 369 void precopy_add_notifier(NotifierWithReturn *n) 370 { 371 notifier_with_return_list_add(&precopy_notifier_list, n); 372 } 373 374 void precopy_remove_notifier(NotifierWithReturn *n) 375 { 376 notifier_with_return_remove(n); 377 } 378 379 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 380 { 381 PrecopyNotifyData pnd; 382 pnd.reason = reason; 383 pnd.errp = errp; 384 385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); 386 } 387 388 void precopy_enable_free_page_optimization(void) 389 { 390 if (!ram_state) { 391 return; 392 } 393 394 ram_state->fpo_enabled = true; 395 } 396 397 uint64_t ram_bytes_remaining(void) 398 { 399 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 400 0; 401 } 402 403 MigrationStats ram_counters; 404 405 /* used by the search for pages to send */ 406 struct PageSearchStatus { 407 /* Current block being searched */ 408 RAMBlock *block; 409 /* Current page to search from */ 410 unsigned long page; 411 /* Set once we wrap around */ 412 bool complete_round; 413 }; 414 typedef struct PageSearchStatus PageSearchStatus; 415 416 CompressionStats compression_counters; 417 418 struct CompressParam { 419 bool done; 420 bool quit; 421 bool zero_page; 422 QEMUFile *file; 423 QemuMutex mutex; 424 QemuCond cond; 425 RAMBlock *block; 426 ram_addr_t offset; 427 428 /* internally used fields */ 429 z_stream stream; 430 uint8_t *originbuf; 431 }; 432 typedef struct CompressParam CompressParam; 433 434 struct DecompressParam { 435 bool done; 436 bool quit; 437 QemuMutex mutex; 438 QemuCond cond; 439 void *des; 440 uint8_t *compbuf; 441 int len; 442 z_stream stream; 443 }; 444 typedef struct DecompressParam DecompressParam; 445 446 static CompressParam *comp_param; 447 static QemuThread *compress_threads; 448 /* comp_done_cond is used to wake up the migration thread when 449 * one of the compression threads has finished the compression. 450 * comp_done_lock is used to co-work with comp_done_cond. 
451 */ 452 static QemuMutex comp_done_lock; 453 static QemuCond comp_done_cond; 454 /* The empty QEMUFileOps will be used by file in CompressParam */ 455 static const QEMUFileOps empty_ops = { }; 456 457 static QEMUFile *decomp_file; 458 static DecompressParam *decomp_param; 459 static QemuThread *decompress_threads; 460 static QemuMutex decomp_done_lock; 461 static QemuCond decomp_done_cond; 462 463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 464 ram_addr_t offset, uint8_t *source_buf); 465 466 static void *do_data_compress(void *opaque) 467 { 468 CompressParam *param = opaque; 469 RAMBlock *block; 470 ram_addr_t offset; 471 bool zero_page; 472 473 qemu_mutex_lock(¶m->mutex); 474 while (!param->quit) { 475 if (param->block) { 476 block = param->block; 477 offset = param->offset; 478 param->block = NULL; 479 qemu_mutex_unlock(¶m->mutex); 480 481 zero_page = do_compress_ram_page(param->file, ¶m->stream, 482 block, offset, param->originbuf); 483 484 qemu_mutex_lock(&comp_done_lock); 485 param->done = true; 486 param->zero_page = zero_page; 487 qemu_cond_signal(&comp_done_cond); 488 qemu_mutex_unlock(&comp_done_lock); 489 490 qemu_mutex_lock(¶m->mutex); 491 } else { 492 qemu_cond_wait(¶m->cond, ¶m->mutex); 493 } 494 } 495 qemu_mutex_unlock(¶m->mutex); 496 497 return NULL; 498 } 499 500 static void compress_threads_save_cleanup(void) 501 { 502 int i, thread_count; 503 504 if (!migrate_use_compression() || !comp_param) { 505 return; 506 } 507 508 thread_count = migrate_compress_threads(); 509 for (i = 0; i < thread_count; i++) { 510 /* 511 * we use it as a indicator which shows if the thread is 512 * properly init'd or not 513 */ 514 if (!comp_param[i].file) { 515 break; 516 } 517 518 qemu_mutex_lock(&comp_param[i].mutex); 519 comp_param[i].quit = true; 520 qemu_cond_signal(&comp_param[i].cond); 521 qemu_mutex_unlock(&comp_param[i].mutex); 522 523 qemu_thread_join(compress_threads + i); 524 qemu_mutex_destroy(&comp_param[i].mutex); 525 qemu_cond_destroy(&comp_param[i].cond); 526 deflateEnd(&comp_param[i].stream); 527 g_free(comp_param[i].originbuf); 528 qemu_fclose(comp_param[i].file); 529 comp_param[i].file = NULL; 530 } 531 qemu_mutex_destroy(&comp_done_lock); 532 qemu_cond_destroy(&comp_done_cond); 533 g_free(compress_threads); 534 g_free(comp_param); 535 compress_threads = NULL; 536 comp_param = NULL; 537 } 538 539 static int compress_threads_save_setup(void) 540 { 541 int i, thread_count; 542 543 if (!migrate_use_compression()) { 544 return 0; 545 } 546 thread_count = migrate_compress_threads(); 547 compress_threads = g_new0(QemuThread, thread_count); 548 comp_param = g_new0(CompressParam, thread_count); 549 qemu_cond_init(&comp_done_cond); 550 qemu_mutex_init(&comp_done_lock); 551 for (i = 0; i < thread_count; i++) { 552 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 553 if (!comp_param[i].originbuf) { 554 goto exit; 555 } 556 557 if (deflateInit(&comp_param[i].stream, 558 migrate_compress_level()) != Z_OK) { 559 g_free(comp_param[i].originbuf); 560 goto exit; 561 } 562 563 /* comp_param[i].file is just used as a dummy buffer to save data, 564 * set its ops to empty. 
565 */ 566 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 567 comp_param[i].done = true; 568 comp_param[i].quit = false; 569 qemu_mutex_init(&comp_param[i].mutex); 570 qemu_cond_init(&comp_param[i].cond); 571 qemu_thread_create(compress_threads + i, "compress", 572 do_data_compress, comp_param + i, 573 QEMU_THREAD_JOINABLE); 574 } 575 return 0; 576 577 exit: 578 compress_threads_save_cleanup(); 579 return -1; 580 } 581 582 /** 583 * save_page_header: write page header to wire 584 * 585 * If this is the 1st block, it also writes the block identification 586 * 587 * Returns the number of bytes written 588 * 589 * @f: QEMUFile where to send the data 590 * @block: block that contains the page we want to send 591 * @offset: offset inside the block for the page 592 * in the lower bits, it contains flags 593 */ 594 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, 595 ram_addr_t offset) 596 { 597 size_t size, len; 598 599 if (block == rs->last_sent_block) { 600 offset |= RAM_SAVE_FLAG_CONTINUE; 601 } 602 qemu_put_be64(f, offset); 603 size = 8; 604 605 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 606 len = strlen(block->idstr); 607 qemu_put_byte(f, len); 608 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 609 size += 1 + len; 610 rs->last_sent_block = block; 611 } 612 return size; 613 } 614 615 /** 616 * mig_throttle_guest_down: throotle down the guest 617 * 618 * Reduce amount of guest cpu execution to hopefully slow down memory 619 * writes. If guest dirty memory rate is reduced below the rate at 620 * which we can transfer pages to the destination then we should be 621 * able to complete migration. Some workloads dirty memory way too 622 * fast and will not effectively converge, even with auto-converge. 623 */ 624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 625 uint64_t bytes_dirty_threshold) 626 { 627 MigrationState *s = migrate_get_current(); 628 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 629 uint64_t pct_increment = s->parameters.cpu_throttle_increment; 630 bool pct_tailslow = s->parameters.cpu_throttle_tailslow; 631 int pct_max = s->parameters.max_cpu_throttle; 632 633 uint64_t throttle_now = cpu_throttle_get_percentage(); 634 uint64_t cpu_now, cpu_ideal, throttle_inc; 635 636 /* We have not started throttling yet. Let's start it. */ 637 if (!cpu_throttle_active()) { 638 cpu_throttle_set(pct_initial); 639 } else { 640 /* Throttling already on, just increase the rate */ 641 if (!pct_tailslow) { 642 throttle_inc = pct_increment; 643 } else { 644 /* Compute the ideal CPU percentage used by Guest, which may 645 * make the dirty rate match the dirty rate threshold. */ 646 cpu_now = 100 - throttle_now; 647 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 648 bytes_dirty_period); 649 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 650 } 651 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 652 } 653 } 654 655 /** 656 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 657 * 658 * @rs: current RAM state 659 * @current_addr: address for the zero page 660 * 661 * Update the xbzrle cache to reflect a page that's been sent as all 0. 662 * The important thing is that a stale (not-yet-0'd) page be replaced 663 * by the new data. 664 * As a bonus, if the page wasn't in the cache it gets added so that 665 * when a small write is made into the 0'd page it gets XBZRLE sent. 
666 */ 667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 668 { 669 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 670 return; 671 } 672 673 /* We don't care if this fails to allocate a new cache page 674 * as long as it updated an old one */ 675 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 676 ram_counters.dirty_sync_count); 677 } 678 679 #define ENCODING_FLAG_XBZRLE 0x1 680 681 /** 682 * save_xbzrle_page: compress and send current page 683 * 684 * Returns: 1 means that we wrote the page 685 * 0 means that page is identical to the one already sent 686 * -1 means that xbzrle would be longer than normal 687 * 688 * @rs: current RAM state 689 * @current_data: pointer to the address of the page contents 690 * @current_addr: addr of the page 691 * @block: block that contains the page we want to send 692 * @offset: offset inside the block for the page 693 * @last_stage: if we are at the completion stage 694 */ 695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 696 ram_addr_t current_addr, RAMBlock *block, 697 ram_addr_t offset, bool last_stage) 698 { 699 int encoded_len = 0, bytes_xbzrle; 700 uint8_t *prev_cached_page; 701 702 if (!cache_is_cached(XBZRLE.cache, current_addr, 703 ram_counters.dirty_sync_count)) { 704 xbzrle_counters.cache_miss++; 705 if (!last_stage) { 706 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 707 ram_counters.dirty_sync_count) == -1) { 708 return -1; 709 } else { 710 /* update *current_data when the page has been 711 inserted into cache */ 712 *current_data = get_cached_data(XBZRLE.cache, current_addr); 713 } 714 } 715 return -1; 716 } 717 718 /* 719 * Reaching here means the page has hit the xbzrle cache, no matter what 720 * encoding result it is (normal encoding, overflow or skipping the page), 721 * count the page as encoded. This is used to calculate the encoding rate. 722 * 723 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 724 * 2nd page turns out to be skipped (i.e. no new bytes written to the 725 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 726 * skipped page included. In this way, the encoding rate can tell if the 727 * guest page is good for xbzrle encoding. 728 */ 729 xbzrle_counters.pages++; 730 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 731 732 /* save current buffer into memory */ 733 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 734 735 /* XBZRLE encoding (if there is no overflow) */ 736 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 737 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 738 TARGET_PAGE_SIZE); 739 740 /* 741 * Update the cache contents, so that it corresponds to the data 742 * sent, in all cases except where we skip the page. 743 */ 744 if (!last_stage && encoded_len != 0) { 745 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 746 /* 747 * In the case where we couldn't compress, ensure that the caller 748 * sends the data from the cache, since the guest might have 749 * changed the RAM since we copied it. 
750 */ 751 *current_data = prev_cached_page; 752 } 753 754 if (encoded_len == 0) { 755 trace_save_xbzrle_page_skipping(); 756 return 0; 757 } else if (encoded_len == -1) { 758 trace_save_xbzrle_page_overflow(); 759 xbzrle_counters.overflow++; 760 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 761 return -1; 762 } 763 764 /* Send XBZRLE based compressed page */ 765 bytes_xbzrle = save_page_header(rs, rs->f, block, 766 offset | RAM_SAVE_FLAG_XBZRLE); 767 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 768 qemu_put_be16(rs->f, encoded_len); 769 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 770 bytes_xbzrle += encoded_len + 1 + 2; 771 /* 772 * Like compressed_size (please see update_compress_thread_counts), 773 * the xbzrle encoded bytes don't count the 8 byte header with 774 * RAM_SAVE_FLAG_CONTINUE. 775 */ 776 xbzrle_counters.bytes += bytes_xbzrle - 8; 777 ram_counters.transferred += bytes_xbzrle; 778 779 return 1; 780 } 781 782 /** 783 * migration_bitmap_find_dirty: find the next dirty page from start 784 * 785 * Returns the page offset within memory region of the start of a dirty page 786 * 787 * @rs: current RAM state 788 * @rb: RAMBlock where to search for dirty pages 789 * @start: page where we start the search 790 */ 791 static inline 792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 793 unsigned long start) 794 { 795 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 796 unsigned long *bitmap = rb->bmap; 797 unsigned long next; 798 799 if (ramblock_is_ignored(rb)) { 800 return size; 801 } 802 803 /* 804 * When the free page optimization is enabled, we need to check the bitmap 805 * to send the non-free pages rather than all the pages in the bulk stage. 806 */ 807 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) { 808 next = start + 1; 809 } else { 810 next = find_next_bit(bitmap, size, start); 811 } 812 813 return next; 814 } 815 816 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 817 RAMBlock *rb, 818 unsigned long page) 819 { 820 bool ret; 821 822 qemu_mutex_lock(&rs->bitmap_mutex); 823 824 /* 825 * Clear dirty bitmap if needed. This _must_ be called before we 826 * send any of the page in the chunk because we need to make sure 827 * we can capture further page content changes when we sync dirty 828 * log the next time. So as long as we are going to send any of 829 * the page in the chunk we clear the remote dirty bitmap for all. 830 * Clearing it earlier won't be a problem, but too late will. 831 */ 832 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) { 833 uint8_t shift = rb->clear_bmap_shift; 834 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift); 835 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); 836 837 /* 838 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 839 * can make things easier sometimes since then start address 840 * of the small chunk will always be 64 pages aligned so the 841 * bitmap will always be aligned to unsigned long. We should 842 * even be able to remove this restriction but I'm simply 843 * keeping it. 
844 */ 845 assert(shift >= 6); 846 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 847 memory_region_clear_dirty_bitmap(rb->mr, start, size); 848 } 849 850 ret = test_and_clear_bit(page, rb->bmap); 851 852 if (ret) { 853 rs->migration_dirty_pages--; 854 } 855 qemu_mutex_unlock(&rs->bitmap_mutex); 856 857 return ret; 858 } 859 860 /* Called with RCU critical section */ 861 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 862 { 863 uint64_t new_dirty_pages = 864 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 865 866 rs->migration_dirty_pages += new_dirty_pages; 867 rs->num_dirty_pages_period += new_dirty_pages; 868 } 869 870 /** 871 * ram_pagesize_summary: calculate all the pagesizes of a VM 872 * 873 * Returns a summary bitmap of the page sizes of all RAMBlocks 874 * 875 * For VMs with just normal pages this is equivalent to the host page 876 * size. If it's got some huge pages then it's the OR of all the 877 * different page sizes. 878 */ 879 uint64_t ram_pagesize_summary(void) 880 { 881 RAMBlock *block; 882 uint64_t summary = 0; 883 884 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 885 summary |= block->page_size; 886 } 887 888 return summary; 889 } 890 891 uint64_t ram_get_total_transferred_pages(void) 892 { 893 return ram_counters.normal + ram_counters.duplicate + 894 compression_counters.pages + xbzrle_counters.pages; 895 } 896 897 static void migration_update_rates(RAMState *rs, int64_t end_time) 898 { 899 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 900 double compressed_size; 901 902 /* calculate period counters */ 903 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 904 / (end_time - rs->time_last_bitmap_sync); 905 906 if (!page_count) { 907 return; 908 } 909 910 if (migrate_use_xbzrle()) { 911 double encoded_size, unencoded_size; 912 913 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 914 rs->xbzrle_cache_miss_prev) / page_count; 915 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 916 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 917 TARGET_PAGE_SIZE; 918 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 919 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 920 xbzrle_counters.encoding_rate = 0; 921 } else { 922 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 923 } 924 rs->xbzrle_pages_prev = xbzrle_counters.pages; 925 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 926 } 927 928 if (migrate_use_compression()) { 929 compression_counters.busy_rate = (double)(compression_counters.busy - 930 rs->compress_thread_busy_prev) / page_count; 931 rs->compress_thread_busy_prev = compression_counters.busy; 932 933 compressed_size = compression_counters.compressed_size - 934 rs->compressed_size_prev; 935 if (compressed_size) { 936 double uncompressed_size = (compression_counters.pages - 937 rs->compress_pages_prev) * TARGET_PAGE_SIZE; 938 939 /* Compression-Ratio = Uncompressed-size / Compressed-size */ 940 compression_counters.compression_rate = 941 uncompressed_size / compressed_size; 942 943 rs->compress_pages_prev = compression_counters.pages; 944 rs->compressed_size_prev = compression_counters.compressed_size; 945 } 946 } 947 } 948 949 static void migration_trigger_throttle(RAMState *rs) 950 { 951 MigrationState *s = migrate_get_current(); 952 uint64_t threshold = s->parameters.throttle_trigger_threshold; 953 954 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; 955 uint64_t 
bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 956 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 957 958 /* During block migration the auto-converge logic incorrectly detects 959 * that ram migration makes no progress. Avoid this by disabling the 960 * throttling logic during the bulk phase of block migration. */ 961 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 962 /* The following detection logic can be refined later. For now: 963 Check to see if the ratio between dirtied bytes and the approx. 964 amount of bytes that just got transferred since the last time 965 we were in this routine reaches the threshold. If that happens 966 twice, start or increase throttling. */ 967 968 if ((bytes_dirty_period > bytes_dirty_threshold) && 969 (++rs->dirty_rate_high_cnt >= 2)) { 970 trace_migration_throttle(); 971 rs->dirty_rate_high_cnt = 0; 972 mig_throttle_guest_down(bytes_dirty_period, 973 bytes_dirty_threshold); 974 } 975 } 976 } 977 978 static void migration_bitmap_sync(RAMState *rs) 979 { 980 RAMBlock *block; 981 int64_t end_time; 982 983 ram_counters.dirty_sync_count++; 984 985 if (!rs->time_last_bitmap_sync) { 986 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 987 } 988 989 trace_migration_bitmap_sync_start(); 990 memory_global_dirty_log_sync(); 991 992 qemu_mutex_lock(&rs->bitmap_mutex); 993 WITH_RCU_READ_LOCK_GUARD() { 994 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 995 ramblock_sync_dirty_bitmap(rs, block); 996 } 997 ram_counters.remaining = ram_bytes_remaining(); 998 } 999 qemu_mutex_unlock(&rs->bitmap_mutex); 1000 1001 memory_global_after_dirty_log_sync(); 1002 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1003 1004 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1005 1006 /* more than 1 second = 1000 millisecons */ 1007 if (end_time > rs->time_last_bitmap_sync + 1000) { 1008 migration_trigger_throttle(rs); 1009 1010 migration_update_rates(rs, end_time); 1011 1012 rs->target_page_count_prev = rs->target_page_count; 1013 1014 /* reset period counters */ 1015 rs->time_last_bitmap_sync = end_time; 1016 rs->num_dirty_pages_period = 0; 1017 rs->bytes_xfer_prev = ram_counters.transferred; 1018 } 1019 if (migrate_use_events()) { 1020 qapi_event_send_migration_pass(ram_counters.dirty_sync_count); 1021 } 1022 } 1023 1024 static void migration_bitmap_sync_precopy(RAMState *rs) 1025 { 1026 Error *local_err = NULL; 1027 1028 /* 1029 * The current notifier usage is just an optimization to migration, so we 1030 * don't stop the normal migration process in the error case. 
1031 */ 1032 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1033 error_report_err(local_err); 1034 local_err = NULL; 1035 } 1036 1037 migration_bitmap_sync(rs); 1038 1039 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1040 error_report_err(local_err); 1041 } 1042 } 1043 1044 /** 1045 * save_zero_page_to_file: send the zero page to the file 1046 * 1047 * Returns the size of data written to the file, 0 means the page is not 1048 * a zero page 1049 * 1050 * @rs: current RAM state 1051 * @file: the file where the data is saved 1052 * @block: block that contains the page we want to send 1053 * @offset: offset inside the block for the page 1054 */ 1055 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, 1056 RAMBlock *block, ram_addr_t offset) 1057 { 1058 uint8_t *p = block->host + offset; 1059 int len = 0; 1060 1061 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1062 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); 1063 qemu_put_byte(file, 0); 1064 len += 1; 1065 } 1066 return len; 1067 } 1068 1069 /** 1070 * save_zero_page: send the zero page to the stream 1071 * 1072 * Returns the number of pages written. 1073 * 1074 * @rs: current RAM state 1075 * @block: block that contains the page we want to send 1076 * @offset: offset inside the block for the page 1077 */ 1078 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1079 { 1080 int len = save_zero_page_to_file(rs, rs->f, block, offset); 1081 1082 if (len) { 1083 ram_counters.duplicate++; 1084 ram_counters.transferred += len; 1085 return 1; 1086 } 1087 return -1; 1088 } 1089 1090 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1091 { 1092 if (!migrate_release_ram() || !migration_in_postcopy()) { 1093 return; 1094 } 1095 1096 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); 1097 } 1098 1099 /* 1100 * @pages: the number of pages written by the control path, 1101 * < 0 - error 1102 * > 0 - number of pages written 1103 * 1104 * Return true if the pages has been saved, otherwise false is returned. 1105 */ 1106 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1107 int *pages) 1108 { 1109 uint64_t bytes_xmit = 0; 1110 int ret; 1111 1112 *pages = -1; 1113 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1114 &bytes_xmit); 1115 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1116 return false; 1117 } 1118 1119 if (bytes_xmit) { 1120 ram_counters.transferred += bytes_xmit; 1121 *pages = 1; 1122 } 1123 1124 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1125 return true; 1126 } 1127 1128 if (bytes_xmit > 0) { 1129 ram_counters.normal++; 1130 } else if (bytes_xmit == 0) { 1131 ram_counters.duplicate++; 1132 } 1133 1134 return true; 1135 } 1136 1137 /* 1138 * directly send the page to the stream 1139 * 1140 * Returns the number of pages written. 
1141 * 1142 * @rs: current RAM state 1143 * @block: block that contains the page we want to send 1144 * @offset: offset inside the block for the page 1145 * @buf: the page to be sent 1146 * @async: send to page asyncly 1147 */ 1148 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1149 uint8_t *buf, bool async) 1150 { 1151 ram_counters.transferred += save_page_header(rs, rs->f, block, 1152 offset | RAM_SAVE_FLAG_PAGE); 1153 if (async) { 1154 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1155 migrate_release_ram() & 1156 migration_in_postcopy()); 1157 } else { 1158 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1159 } 1160 ram_counters.transferred += TARGET_PAGE_SIZE; 1161 ram_counters.normal++; 1162 return 1; 1163 } 1164 1165 /** 1166 * ram_save_page: send the given page to the stream 1167 * 1168 * Returns the number of pages written. 1169 * < 0 - error 1170 * >=0 - Number of pages written - this might legally be 0 1171 * if xbzrle noticed the page was the same. 1172 * 1173 * @rs: current RAM state 1174 * @block: block that contains the page we want to send 1175 * @offset: offset inside the block for the page 1176 * @last_stage: if we are at the completion stage 1177 */ 1178 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1179 { 1180 int pages = -1; 1181 uint8_t *p; 1182 bool send_async = true; 1183 RAMBlock *block = pss->block; 1184 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1185 ram_addr_t current_addr = block->offset + offset; 1186 1187 p = block->host + offset; 1188 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1189 1190 XBZRLE_cache_lock(); 1191 if (!rs->ram_bulk_stage && !migration_in_postcopy() && 1192 migrate_use_xbzrle()) { 1193 pages = save_xbzrle_page(rs, &p, current_addr, block, 1194 offset, last_stage); 1195 if (!last_stage) { 1196 /* Can't send this cached data async, since the cache page 1197 * might get updated before it gets to the wire 1198 */ 1199 send_async = false; 1200 } 1201 } 1202 1203 /* XBZRLE overflow or normal page */ 1204 if (pages == -1) { 1205 pages = save_normal_page(rs, block, offset, p, send_async); 1206 } 1207 1208 XBZRLE_cache_unlock(); 1209 1210 return pages; 1211 } 1212 1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, 1214 ram_addr_t offset) 1215 { 1216 if (multifd_queue_page(rs->f, block, offset) < 0) { 1217 return -1; 1218 } 1219 ram_counters.normal++; 1220 1221 return 1; 1222 } 1223 1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1225 ram_addr_t offset, uint8_t *source_buf) 1226 { 1227 RAMState *rs = ram_state; 1228 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1229 bool zero_page = false; 1230 int ret; 1231 1232 if (save_zero_page_to_file(rs, f, block, offset)) { 1233 zero_page = true; 1234 goto exit; 1235 } 1236 1237 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); 1238 1239 /* 1240 * copy it to a internal buffer to avoid it being modified by VM 1241 * so that we can catch up the error during compression and 1242 * decompression 1243 */ 1244 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1245 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1246 if (ret < 0) { 1247 qemu_file_set_error(migrate_get_current()->to_dst_file, ret); 1248 error_report("compressed data failed!"); 1249 return false; 1250 } 1251 1252 exit: 1253 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1254 return zero_page; 1255 } 1256 1257 static void 1258 
update_compress_thread_counts(const CompressParam *param, int bytes_xmit) 1259 { 1260 ram_counters.transferred += bytes_xmit; 1261 1262 if (param->zero_page) { 1263 ram_counters.duplicate++; 1264 return; 1265 } 1266 1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */ 1268 compression_counters.compressed_size += bytes_xmit - 8; 1269 compression_counters.pages++; 1270 } 1271 1272 static bool save_page_use_compression(RAMState *rs); 1273 1274 static void flush_compressed_data(RAMState *rs) 1275 { 1276 int idx, len, thread_count; 1277 1278 if (!save_page_use_compression(rs)) { 1279 return; 1280 } 1281 thread_count = migrate_compress_threads(); 1282 1283 qemu_mutex_lock(&comp_done_lock); 1284 for (idx = 0; idx < thread_count; idx++) { 1285 while (!comp_param[idx].done) { 1286 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1287 } 1288 } 1289 qemu_mutex_unlock(&comp_done_lock); 1290 1291 for (idx = 0; idx < thread_count; idx++) { 1292 qemu_mutex_lock(&comp_param[idx].mutex); 1293 if (!comp_param[idx].quit) { 1294 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1295 /* 1296 * it's safe to fetch zero_page without holding comp_done_lock 1297 * as there is no further request submitted to the thread, 1298 * i.e, the thread should be waiting for a request at this point. 1299 */ 1300 update_compress_thread_counts(&comp_param[idx], len); 1301 } 1302 qemu_mutex_unlock(&comp_param[idx].mutex); 1303 } 1304 } 1305 1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1307 ram_addr_t offset) 1308 { 1309 param->block = block; 1310 param->offset = offset; 1311 } 1312 1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1314 ram_addr_t offset) 1315 { 1316 int idx, thread_count, bytes_xmit = -1, pages = -1; 1317 bool wait = migrate_compress_wait_thread(); 1318 1319 thread_count = migrate_compress_threads(); 1320 qemu_mutex_lock(&comp_done_lock); 1321 retry: 1322 for (idx = 0; idx < thread_count; idx++) { 1323 if (comp_param[idx].done) { 1324 comp_param[idx].done = false; 1325 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1326 qemu_mutex_lock(&comp_param[idx].mutex); 1327 set_compress_params(&comp_param[idx], block, offset); 1328 qemu_cond_signal(&comp_param[idx].cond); 1329 qemu_mutex_unlock(&comp_param[idx].mutex); 1330 pages = 1; 1331 update_compress_thread_counts(&comp_param[idx], bytes_xmit); 1332 break; 1333 } 1334 } 1335 1336 /* 1337 * wait for the free thread if the user specifies 'compress-wait-thread', 1338 * otherwise we will post the page out in the main thread as normal page. 1339 */ 1340 if (pages < 0 && wait) { 1341 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1342 goto retry; 1343 } 1344 qemu_mutex_unlock(&comp_done_lock); 1345 1346 return pages; 1347 } 1348 1349 /** 1350 * find_dirty_block: find the next dirty page and update any state 1351 * associated with the search process. 1352 * 1353 * Returns true if a page is found 1354 * 1355 * @rs: current RAM state 1356 * @pss: data about the state of the current dirty page scan 1357 * @again: set to false if the search has scanned the whole of RAM 1358 */ 1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1360 { 1361 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1362 if (pss->complete_round && pss->block == rs->last_seen_block && 1363 pss->page >= rs->last_page) { 1364 /* 1365 * We've been once around the RAM and haven't found anything. 1366 * Give up. 
1367 */ 1368 *again = false; 1369 return false; 1370 } 1371 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS) 1372 >= pss->block->used_length) { 1373 /* Didn't find anything in this RAM Block */ 1374 pss->page = 0; 1375 pss->block = QLIST_NEXT_RCU(pss->block, next); 1376 if (!pss->block) { 1377 /* 1378 * If memory migration starts over, we will meet a dirtied page 1379 * which may still exists in compression threads's ring, so we 1380 * should flush the compressed data to make sure the new page 1381 * is not overwritten by the old one in the destination. 1382 * 1383 * Also If xbzrle is on, stop using the data compression at this 1384 * point. In theory, xbzrle can do better than compression. 1385 */ 1386 flush_compressed_data(rs); 1387 1388 /* Hit the end of the list */ 1389 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1390 /* Flag that we've looped */ 1391 pss->complete_round = true; 1392 rs->ram_bulk_stage = false; 1393 } 1394 /* Didn't find anything this time, but try again on the new block */ 1395 *again = true; 1396 return false; 1397 } else { 1398 /* Can go around again, but... */ 1399 *again = true; 1400 /* We've found something so probably don't need to */ 1401 return true; 1402 } 1403 } 1404 1405 /** 1406 * unqueue_page: gets a page of the queue 1407 * 1408 * Helper for 'get_queued_page' - gets a page off the queue 1409 * 1410 * Returns the block of the page (or NULL if none available) 1411 * 1412 * @rs: current RAM state 1413 * @offset: used to return the offset within the RAMBlock 1414 */ 1415 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1416 { 1417 RAMBlock *block = NULL; 1418 1419 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) { 1420 return NULL; 1421 } 1422 1423 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1424 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1425 struct RAMSrcPageRequest *entry = 1426 QSIMPLEQ_FIRST(&rs->src_page_requests); 1427 block = entry->rb; 1428 *offset = entry->offset; 1429 1430 if (entry->len > TARGET_PAGE_SIZE) { 1431 entry->len -= TARGET_PAGE_SIZE; 1432 entry->offset += TARGET_PAGE_SIZE; 1433 } else { 1434 memory_region_unref(block->mr); 1435 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1436 g_free(entry); 1437 migration_consume_urgent_request(); 1438 } 1439 } 1440 1441 return block; 1442 } 1443 1444 #if defined(__linux__) 1445 /** 1446 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1447 * is found, return RAM block pointer and page offset 1448 * 1449 * Returns pointer to the RAMBlock containing faulting page, 1450 * NULL if no write faults are pending 1451 * 1452 * @rs: current RAM state 1453 * @offset: page offset from the beginning of the block 1454 */ 1455 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1456 { 1457 struct uffd_msg uffd_msg; 1458 void *page_address; 1459 RAMBlock *bs; 1460 int res; 1461 1462 if (!migrate_background_snapshot()) { 1463 return NULL; 1464 } 1465 1466 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1467 if (res <= 0) { 1468 return NULL; 1469 } 1470 1471 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1472 bs = qemu_ram_block_from_host(page_address, false, offset); 1473 assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); 1474 return bs; 1475 } 1476 1477 /** 1478 * ram_save_release_protection: release UFFD write protection after 1479 * a range of pages has been saved 1480 * 1481 * @rs: current RAM state 1482 * @pss: page-search-status structure 1483 * @start_page: index of the first page in the range relative 
to pss->block 1484 * 1485 * Returns 0 on success, negative value in case of an error 1486 */ 1487 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1488 unsigned long start_page) 1489 { 1490 int res = 0; 1491 1492 /* Check if page is from UFFD-managed region. */ 1493 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1494 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1495 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; 1496 1497 /* Flush async buffers before un-protect. */ 1498 qemu_fflush(rs->f); 1499 /* Un-protect memory range. */ 1500 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1501 false, false); 1502 } 1503 1504 return res; 1505 } 1506 1507 /* ram_write_tracking_available: check if kernel supports required UFFD features 1508 * 1509 * Returns true if supports, false otherwise 1510 */ 1511 bool ram_write_tracking_available(void) 1512 { 1513 uint64_t uffd_features; 1514 int res; 1515 1516 res = uffd_query_features(&uffd_features); 1517 return (res == 0 && 1518 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1519 } 1520 1521 /* ram_write_tracking_compatible: check if guest configuration is 1522 * compatible with 'write-tracking' 1523 * 1524 * Returns true if compatible, false otherwise 1525 */ 1526 bool ram_write_tracking_compatible(void) 1527 { 1528 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1529 int uffd_fd; 1530 RAMBlock *bs; 1531 bool ret = false; 1532 1533 /* Open UFFD file descriptor */ 1534 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1535 if (uffd_fd < 0) { 1536 return false; 1537 } 1538 1539 RCU_READ_LOCK_GUARD(); 1540 1541 RAMBLOCK_FOREACH_NOT_IGNORED(bs) { 1542 uint64_t uffd_ioctls; 1543 1544 /* Nothing to do with read-only and MMIO-writable regions */ 1545 if (bs->mr->readonly || bs->mr->rom_device) { 1546 continue; 1547 } 1548 /* Try to register block memory via UFFD-IO to track writes */ 1549 if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, 1550 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1551 goto out; 1552 } 1553 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1554 goto out; 1555 } 1556 } 1557 ret = true; 1558 1559 out: 1560 uffd_close_fd(uffd_fd); 1561 return ret; 1562 } 1563 1564 /* 1565 * ram_write_tracking_start: start UFFD-WP memory tracking 1566 * 1567 * Returns 0 for success or negative value in case of error 1568 */ 1569 int ram_write_tracking_start(void) 1570 { 1571 int uffd_fd; 1572 RAMState *rs = ram_state; 1573 RAMBlock *bs; 1574 1575 /* Open UFFD file descriptor */ 1576 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1577 if (uffd_fd < 0) { 1578 return uffd_fd; 1579 } 1580 rs->uffdio_fd = uffd_fd; 1581 1582 RCU_READ_LOCK_GUARD(); 1583 1584 RAMBLOCK_FOREACH_NOT_IGNORED(bs) { 1585 /* Nothing to do with read-only and MMIO-writable regions */ 1586 if (bs->mr->readonly || bs->mr->rom_device) { 1587 continue; 1588 } 1589 1590 /* Register block memory with UFFD to track writes */ 1591 if (uffd_register_memory(rs->uffdio_fd, bs->host, 1592 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1593 goto fail; 1594 } 1595 /* Apply UFFD write protection to the block memory range */ 1596 if (uffd_change_protection(rs->uffdio_fd, bs->host, 1597 bs->max_length, true, false)) { 1598 goto fail; 1599 } 1600 bs->flags |= RAM_UF_WRITEPROTECT; 1601 memory_region_ref(bs->mr); 1602 1603 trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, 1604 bs->host, bs->max_length); 1605 } 1606 1607 return 
0; 1608 1609 fail: 1610 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1611 1612 RAMBLOCK_FOREACH_NOT_IGNORED(bs) { 1613 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { 1614 continue; 1615 } 1616 /* 1617 * In case some memory block failed to be write-protected 1618 * remove protection and unregister all succeeded RAM blocks 1619 */ 1620 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); 1621 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); 1622 /* Cleanup flags and remove reference */ 1623 bs->flags &= ~RAM_UF_WRITEPROTECT; 1624 memory_region_unref(bs->mr); 1625 } 1626 1627 uffd_close_fd(uffd_fd); 1628 rs->uffdio_fd = -1; 1629 return -1; 1630 } 1631 1632 /** 1633 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1634 */ 1635 void ram_write_tracking_stop(void) 1636 { 1637 RAMState *rs = ram_state; 1638 RAMBlock *bs; 1639 1640 RCU_READ_LOCK_GUARD(); 1641 1642 RAMBLOCK_FOREACH_NOT_IGNORED(bs) { 1643 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { 1644 continue; 1645 } 1646 /* Remove protection and unregister all affected RAM blocks */ 1647 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); 1648 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); 1649 1650 trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, 1651 bs->host, bs->max_length); 1652 1653 /* Cleanup flags and remove reference */ 1654 bs->flags &= ~RAM_UF_WRITEPROTECT; 1655 memory_region_unref(bs->mr); 1656 } 1657 1658 /* Finally close UFFD file descriptor */ 1659 uffd_close_fd(rs->uffdio_fd); 1660 rs->uffdio_fd = -1; 1661 } 1662 1663 #else 1664 /* No target OS support, stubs just fail or ignore */ 1665 1666 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1667 { 1668 (void) rs; 1669 (void) offset; 1670 1671 return NULL; 1672 } 1673 1674 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1675 unsigned long start_page) 1676 { 1677 (void) rs; 1678 (void) pss; 1679 (void) start_page; 1680 1681 return 0; 1682 } 1683 1684 bool ram_write_tracking_available(void) 1685 { 1686 return false; 1687 } 1688 1689 bool ram_write_tracking_compatible(void) 1690 { 1691 assert(0); 1692 return false; 1693 } 1694 1695 int ram_write_tracking_start(void) 1696 { 1697 assert(0); 1698 return -1; 1699 } 1700 1701 void ram_write_tracking_stop(void) 1702 { 1703 assert(0); 1704 } 1705 #endif /* defined(__linux__) */ 1706 1707 /** 1708 * get_queued_page: unqueue a page from the postcopy requests 1709 * 1710 * Skips pages that are already sent (!dirty) 1711 * 1712 * Returns true if a queued page is found 1713 * 1714 * @rs: current RAM state 1715 * @pss: data about the state of the current dirty page scan 1716 */ 1717 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1718 { 1719 RAMBlock *block; 1720 ram_addr_t offset; 1721 bool dirty; 1722 1723 do { 1724 block = unqueue_page(rs, &offset); 1725 /* 1726 * We're sending this page, and since it's postcopy nothing else 1727 * will dirty it, and we must make sure it doesn't get sent again 1728 * even if this queue request was received after the background 1729 * search already sent it. 
1730 */ 1731 if (block) { 1732 unsigned long page; 1733 1734 page = offset >> TARGET_PAGE_BITS; 1735 dirty = test_bit(page, block->bmap); 1736 if (!dirty) { 1737 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1738 page); 1739 } else { 1740 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1741 } 1742 } 1743 1744 } while (block && !dirty); 1745 1746 if (!block) { 1747 /* 1748 * Poll write faults too if background snapshot is enabled; that's 1749 * when we have vcpus got blocked by the write protected pages. 1750 */ 1751 block = poll_fault_page(rs, &offset); 1752 } 1753 1754 if (block) { 1755 /* 1756 * As soon as we start servicing pages out of order, then we have 1757 * to kill the bulk stage, since the bulk stage assumes 1758 * in (migration_bitmap_find_and_reset_dirty) that every page is 1759 * dirty, that's no longer true. 1760 */ 1761 rs->ram_bulk_stage = false; 1762 1763 /* 1764 * We want the background search to continue from the queued page 1765 * since the guest is likely to want other pages near to the page 1766 * it just requested. 1767 */ 1768 pss->block = block; 1769 pss->page = offset >> TARGET_PAGE_BITS; 1770 1771 /* 1772 * This unqueued page would break the "one round" check, even is 1773 * really rare. 1774 */ 1775 pss->complete_round = false; 1776 } 1777 1778 return !!block; 1779 } 1780 1781 /** 1782 * migration_page_queue_free: drop any remaining pages in the ram 1783 * request queue 1784 * 1785 * It should be empty at the end anyway, but in error cases there may 1786 * be some left. in case that there is any page left, we drop it. 1787 * 1788 */ 1789 static void migration_page_queue_free(RAMState *rs) 1790 { 1791 struct RAMSrcPageRequest *mspr, *next_mspr; 1792 /* This queue generally should be empty - but in the case of a failed 1793 * migration might have some droppings in. 1794 */ 1795 RCU_READ_LOCK_GUARD(); 1796 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1797 memory_region_unref(mspr->rb->mr); 1798 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1799 g_free(mspr); 1800 } 1801 } 1802 1803 /** 1804 * ram_save_queue_pages: queue the page for transmission 1805 * 1806 * A request from postcopy destination for example. 1807 * 1808 * Returns zero on success or negative on error 1809 * 1810 * @rbname: Name of the RAMBLock of the request. NULL means the 1811 * same that last one. 1812 * @start: starting address from the start of the RAMBlock 1813 * @len: length (in bytes) to send 1814 */ 1815 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1816 { 1817 RAMBlock *ramblock; 1818 RAMState *rs = ram_state; 1819 1820 ram_counters.postcopy_requests++; 1821 RCU_READ_LOCK_GUARD(); 1822 1823 if (!rbname) { 1824 /* Reuse last RAMBlock */ 1825 ramblock = rs->last_req_rb; 1826 1827 if (!ramblock) { 1828 /* 1829 * Shouldn't happen, we can't reuse the last RAMBlock if 1830 * it's the 1st request. 
1831 */ 1832 error_report("ram_save_queue_pages no previous block"); 1833 return -1; 1834 } 1835 } else { 1836 ramblock = qemu_ram_block_by_name(rbname); 1837 1838 if (!ramblock) { 1839 /* We shouldn't be asked for a non-existent RAMBlock */ 1840 error_report("ram_save_queue_pages no block '%s'", rbname); 1841 return -1; 1842 } 1843 rs->last_req_rb = ramblock; 1844 } 1845 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1846 if (start + len > ramblock->used_length) { 1847 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1848 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1849 __func__, start, len, ramblock->used_length); 1850 return -1; 1851 } 1852 1853 struct RAMSrcPageRequest *new_entry = 1854 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1855 new_entry->rb = ramblock; 1856 new_entry->offset = start; 1857 new_entry->len = len; 1858 1859 memory_region_ref(ramblock->mr); 1860 qemu_mutex_lock(&rs->src_page_req_mutex); 1861 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1862 migration_make_urgent_request(); 1863 qemu_mutex_unlock(&rs->src_page_req_mutex); 1864 1865 return 0; 1866 } 1867 1868 static bool save_page_use_compression(RAMState *rs) 1869 { 1870 if (!migrate_use_compression()) { 1871 return false; 1872 } 1873 1874 /* 1875 * If xbzrle is on, stop using the data compression after first 1876 * round of migration even if compression is enabled. In theory, 1877 * xbzrle can do better than compression. 1878 */ 1879 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 1880 return true; 1881 } 1882 1883 return false; 1884 } 1885 1886 /* 1887 * try to compress the page before posting it out, return true if the page 1888 * has been properly handled by compression, otherwise needs other 1889 * paths to handle it 1890 */ 1891 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1892 { 1893 if (!save_page_use_compression(rs)) { 1894 return false; 1895 } 1896 1897 /* 1898 * When starting the process of a new block, the first page of 1899 * the block should be sent out before other pages in the same 1900 * block, and all the pages in last block should have been sent 1901 * out, keeping this order is important, because the 'cont' flag 1902 * is used to avoid resending the block name. 1903 * 1904 * We post the fist page as normal page as compression will take 1905 * much CPU resource. 
1906 */ 1907 if (block != rs->last_sent_block) { 1908 flush_compressed_data(rs); 1909 return false; 1910 } 1911 1912 if (compress_page_with_multi_thread(rs, block, offset) > 0) { 1913 return true; 1914 } 1915 1916 compression_counters.busy++; 1917 return false; 1918 } 1919 1920 /** 1921 * ram_save_target_page: save one target page 1922 * 1923 * Returns the number of pages written 1924 * 1925 * @rs: current RAM state 1926 * @pss: data about the page we want to send 1927 * @last_stage: if we are at the completion stage 1928 */ 1929 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1930 bool last_stage) 1931 { 1932 RAMBlock *block = pss->block; 1933 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1934 int res; 1935 1936 if (control_save_page(rs, block, offset, &res)) { 1937 return res; 1938 } 1939 1940 if (save_compress_page(rs, block, offset)) { 1941 return 1; 1942 } 1943 1944 res = save_zero_page(rs, block, offset); 1945 if (res > 0) { 1946 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1947 * page would be stale 1948 */ 1949 if (!save_page_use_compression(rs)) { 1950 XBZRLE_cache_lock(); 1951 xbzrle_cache_zero_page(rs, block->offset + offset); 1952 XBZRLE_cache_unlock(); 1953 } 1954 ram_release_pages(block->idstr, offset, res); 1955 return res; 1956 } 1957 1958 /* 1959 * Do not use multifd for: 1960 * 1. Compression as the first page in the new block should be posted out 1961 * before sending the compressed page 1962 * 2. In postcopy as one whole host page should be placed 1963 */ 1964 if (!save_page_use_compression(rs) && migrate_use_multifd() 1965 && !migration_in_postcopy()) { 1966 return ram_save_multifd_page(rs, block, offset); 1967 } 1968 1969 return ram_save_page(rs, pss, last_stage); 1970 } 1971 1972 /** 1973 * ram_save_host_page: save a whole host page 1974 * 1975 * Starting at *offset send pages up to the end of the current host 1976 * page. It's valid for the initial offset to point into the middle of 1977 * a host page in which case the remainder of the hostpage is sent. 1978 * Only dirty target pages are sent. Note that the host page size may 1979 * be a huge page for this block. 1980 * The saving stops at the boundary of the used_length of the block 1981 * if the RAMBlock isn't a multiple of the host page size. 
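 *
 * As a rough worked example (illustrative only): with a 2MiB hugetlbfs
 * backed RAMBlock and 4KiB target pages, one call can cover up to
 * 2MiB / 4KiB = 512 target pages before returning, skipping those whose
 * dirty bit is already clear.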
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check if the page is dirty and if so send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the last one we looked at */
    pss->page--;

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
2043 */ 2044 2045 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 2046 { 2047 PageSearchStatus pss; 2048 int pages = 0; 2049 bool again, found; 2050 2051 /* No dirty page as there is zero RAM */ 2052 if (!ram_bytes_total()) { 2053 return pages; 2054 } 2055 2056 pss.block = rs->last_seen_block; 2057 pss.page = rs->last_page; 2058 pss.complete_round = false; 2059 2060 if (!pss.block) { 2061 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 2062 } 2063 2064 do { 2065 again = true; 2066 found = get_queued_page(rs, &pss); 2067 2068 if (!found) { 2069 /* priority queue empty, so just search for something dirty */ 2070 found = find_dirty_block(rs, &pss, &again); 2071 } 2072 2073 if (found) { 2074 pages = ram_save_host_page(rs, &pss, last_stage); 2075 } 2076 } while (!pages && again); 2077 2078 rs->last_seen_block = pss.block; 2079 rs->last_page = pss.page; 2080 2081 return pages; 2082 } 2083 2084 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2085 { 2086 uint64_t pages = size / TARGET_PAGE_SIZE; 2087 2088 if (zero) { 2089 ram_counters.duplicate += pages; 2090 } else { 2091 ram_counters.normal += pages; 2092 ram_counters.transferred += size; 2093 qemu_update_position(f, size); 2094 } 2095 } 2096 2097 static uint64_t ram_bytes_total_common(bool count_ignored) 2098 { 2099 RAMBlock *block; 2100 uint64_t total = 0; 2101 2102 RCU_READ_LOCK_GUARD(); 2103 2104 if (count_ignored) { 2105 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2106 total += block->used_length; 2107 } 2108 } else { 2109 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2110 total += block->used_length; 2111 } 2112 } 2113 return total; 2114 } 2115 2116 uint64_t ram_bytes_total(void) 2117 { 2118 return ram_bytes_total_common(false); 2119 } 2120 2121 static void xbzrle_load_setup(void) 2122 { 2123 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2124 } 2125 2126 static void xbzrle_load_cleanup(void) 2127 { 2128 g_free(XBZRLE.decoded_buf); 2129 XBZRLE.decoded_buf = NULL; 2130 } 2131 2132 static void ram_state_cleanup(RAMState **rsp) 2133 { 2134 if (*rsp) { 2135 migration_page_queue_free(*rsp); 2136 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2137 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2138 g_free(*rsp); 2139 *rsp = NULL; 2140 } 2141 } 2142 2143 static void xbzrle_cleanup(void) 2144 { 2145 XBZRLE_cache_lock(); 2146 if (XBZRLE.cache) { 2147 cache_fini(XBZRLE.cache); 2148 g_free(XBZRLE.encoded_buf); 2149 g_free(XBZRLE.current_buf); 2150 g_free(XBZRLE.zero_target_page); 2151 XBZRLE.cache = NULL; 2152 XBZRLE.encoded_buf = NULL; 2153 XBZRLE.current_buf = NULL; 2154 XBZRLE.zero_target_page = NULL; 2155 } 2156 XBZRLE_cache_unlock(); 2157 } 2158 2159 static void ram_save_cleanup(void *opaque) 2160 { 2161 RAMState **rsp = opaque; 2162 RAMBlock *block; 2163 2164 /* We don't use dirty log with background snapshots */ 2165 if (!migrate_background_snapshot()) { 2166 /* caller have hold iothread lock or is in a bh, so there is 2167 * no writing race against the migration bitmap 2168 */ 2169 memory_global_dirty_log_stop(); 2170 } 2171 2172 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2173 g_free(block->clear_bmap); 2174 block->clear_bmap = NULL; 2175 g_free(block->bmap); 2176 block->bmap = NULL; 2177 } 2178 2179 xbzrle_cleanup(); 2180 compress_threads_save_cleanup(); 2181 ram_state_cleanup(rsp); 2182 } 2183 2184 static void ram_state_reset(RAMState *rs) 2185 { 2186 rs->last_seen_block = NULL; 2187 rs->last_sent_block = NULL; 2188 rs->last_page = 0; 2189 rs->last_version = ram_list.version; 2190 rs->ram_bulk_stage = true; 2191 rs->fpo_enabled 
= false; 2192 } 2193 2194 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2195 2196 /* 2197 * 'expected' is the value you expect the bitmap mostly to be full 2198 * of; it won't bother printing lines that are all this value. 2199 * If 'todump' is null the migration bitmap is dumped. 2200 */ 2201 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 2202 unsigned long pages) 2203 { 2204 int64_t cur; 2205 int64_t linelen = 128; 2206 char linebuf[129]; 2207 2208 for (cur = 0; cur < pages; cur += linelen) { 2209 int64_t curb; 2210 bool found = false; 2211 /* 2212 * Last line; catch the case where the line length 2213 * is longer than remaining ram 2214 */ 2215 if (cur + linelen > pages) { 2216 linelen = pages - cur; 2217 } 2218 for (curb = 0; curb < linelen; curb++) { 2219 bool thisbit = test_bit(cur + curb, todump); 2220 linebuf[curb] = thisbit ? '1' : '.'; 2221 found = found || (thisbit != expected); 2222 } 2223 if (found) { 2224 linebuf[curb] = '\0'; 2225 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 2226 } 2227 } 2228 } 2229 2230 /* **** functions for postcopy ***** */ 2231 2232 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2233 { 2234 struct RAMBlock *block; 2235 2236 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2237 unsigned long *bitmap = block->bmap; 2238 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2239 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2240 2241 while (run_start < range) { 2242 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2243 ram_discard_range(block->idstr, 2244 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2245 ((ram_addr_t)(run_end - run_start)) 2246 << TARGET_PAGE_BITS); 2247 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2248 } 2249 } 2250 } 2251 2252 /** 2253 * postcopy_send_discard_bm_ram: discard a RAMBlock 2254 * 2255 * Returns zero on success 2256 * 2257 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2258 * 2259 * @ms: current migration state 2260 * @block: RAMBlock to discard 2261 */ 2262 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2263 { 2264 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2265 unsigned long current; 2266 unsigned long *bitmap = block->bmap; 2267 2268 for (current = 0; current < end; ) { 2269 unsigned long one = find_next_bit(bitmap, end, current); 2270 unsigned long zero, discard_length; 2271 2272 if (one >= end) { 2273 break; 2274 } 2275 2276 zero = find_next_zero_bit(bitmap, end, one + 1); 2277 2278 if (zero >= end) { 2279 discard_length = end - one; 2280 } else { 2281 discard_length = zero - one; 2282 } 2283 postcopy_discard_send_range(ms, one, discard_length); 2284 current = one + discard_length; 2285 } 2286 2287 return 0; 2288 } 2289 2290 /** 2291 * postcopy_each_ram_send_discard: discard all RAMBlocks 2292 * 2293 * Returns 0 for success or negative for error 2294 * 2295 * Utility for the outgoing postcopy code. 2296 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2297 * passing it bitmap indexes and name. 
2298 * (qemu_ram_foreach_block ends up passing unscaled lengths 2299 * which would mean postcopy code would have to deal with target page) 2300 * 2301 * @ms: current migration state 2302 */ 2303 static int postcopy_each_ram_send_discard(MigrationState *ms) 2304 { 2305 struct RAMBlock *block; 2306 int ret; 2307 2308 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2309 postcopy_discard_send_init(ms, block->idstr); 2310 2311 /* 2312 * Postcopy sends chunks of bitmap over the wire, but it 2313 * just needs indexes at this point, avoids it having 2314 * target page specific code. 2315 */ 2316 ret = postcopy_send_discard_bm_ram(ms, block); 2317 postcopy_discard_send_finish(ms); 2318 if (ret) { 2319 return ret; 2320 } 2321 } 2322 2323 return 0; 2324 } 2325 2326 /** 2327 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2328 * 2329 * Helper for postcopy_chunk_hostpages; it's called twice to 2330 * canonicalize the two bitmaps, that are similar, but one is 2331 * inverted. 2332 * 2333 * Postcopy requires that all target pages in a hostpage are dirty or 2334 * clean, not a mix. This function canonicalizes the bitmaps. 2335 * 2336 * @ms: current migration state 2337 * @block: block that contains the page we want to canonicalize 2338 */ 2339 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2340 { 2341 RAMState *rs = ram_state; 2342 unsigned long *bitmap = block->bmap; 2343 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2344 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2345 unsigned long run_start; 2346 2347 if (block->page_size == TARGET_PAGE_SIZE) { 2348 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2349 return; 2350 } 2351 2352 /* Find a dirty page */ 2353 run_start = find_next_bit(bitmap, pages, 0); 2354 2355 while (run_start < pages) { 2356 2357 /* 2358 * If the start of this run of pages is in the middle of a host 2359 * page, then we need to fixup this host page. 2360 */ 2361 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2362 /* Find the end of this run */ 2363 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2364 /* 2365 * If the end isn't at the start of a host page, then the 2366 * run doesn't finish at the end of a host page 2367 * and we need to discard. 2368 */ 2369 } 2370 2371 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2372 unsigned long page; 2373 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2374 host_ratio); 2375 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2376 2377 /* Clean up the bitmap */ 2378 for (page = fixup_start_addr; 2379 page < fixup_start_addr + host_ratio; page++) { 2380 /* 2381 * Remark them as dirty, updating the count for any pages 2382 * that weren't previously dirty. 2383 */ 2384 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2385 } 2386 } 2387 2388 /* Find the next dirty page for the next iteration */ 2389 run_start = find_next_bit(bitmap, pages, run_start); 2390 } 2391 } 2392 2393 /** 2394 * postcopy_chunk_hostpages: discard any partially sent host page 2395 * 2396 * Utility for the outgoing postcopy code. 2397 * 2398 * Discard any partially sent host-page size chunks, mark any partially 2399 * dirty host-page size chunks as all dirty. In this case the host-page 2400 * is the host-page for the particular RAMBlock, i.e. 
it might be a huge page 2401 * 2402 * Returns zero on success 2403 * 2404 * @ms: current migration state 2405 * @block: block we want to work with 2406 */ 2407 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2408 { 2409 postcopy_discard_send_init(ms, block->idstr); 2410 2411 /* 2412 * Ensure that all partially dirty host pages are made fully dirty. 2413 */ 2414 postcopy_chunk_hostpages_pass(ms, block); 2415 2416 postcopy_discard_send_finish(ms); 2417 return 0; 2418 } 2419 2420 /** 2421 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2422 * 2423 * Returns zero on success 2424 * 2425 * Transmit the set of pages to be discarded after precopy to the target 2426 * these are pages that: 2427 * a) Have been previously transmitted but are now dirty again 2428 * b) Pages that have never been transmitted, this ensures that 2429 * any pages on the destination that have been mapped by background 2430 * tasks get discarded (transparent huge pages is the specific concern) 2431 * Hopefully this is pretty sparse 2432 * 2433 * @ms: current migration state 2434 */ 2435 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2436 { 2437 RAMState *rs = ram_state; 2438 RAMBlock *block; 2439 int ret; 2440 2441 RCU_READ_LOCK_GUARD(); 2442 2443 /* This should be our last sync, the src is now paused */ 2444 migration_bitmap_sync(rs); 2445 2446 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2447 rs->last_seen_block = NULL; 2448 rs->last_sent_block = NULL; 2449 rs->last_page = 0; 2450 2451 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2452 /* Deal with TPS != HPS and huge pages */ 2453 ret = postcopy_chunk_hostpages(ms, block); 2454 if (ret) { 2455 return ret; 2456 } 2457 2458 #ifdef DEBUG_POSTCOPY 2459 ram_debug_dump_bitmap(block->bmap, true, 2460 block->used_length >> TARGET_PAGE_BITS); 2461 #endif 2462 } 2463 trace_ram_postcopy_send_discard_bitmap(); 2464 2465 return postcopy_each_ram_send_discard(ms); 2466 } 2467 2468 /** 2469 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2470 * 2471 * Returns zero on success 2472 * 2473 * @rbname: name of the RAMBlock of the request. NULL means the 2474 * same that last one. 2475 * @start: RAMBlock starting page 2476 * @length: RAMBlock size 2477 */ 2478 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2479 { 2480 trace_ram_discard_range(rbname, start, length); 2481 2482 RCU_READ_LOCK_GUARD(); 2483 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2484 2485 if (!rb) { 2486 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2487 return -1; 2488 } 2489 2490 /* 2491 * On source VM, we don't need to update the received bitmap since 2492 * we don't even have one. 2493 */ 2494 if (rb->receivedmap) { 2495 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2496 length >> qemu_target_page_bits()); 2497 } 2498 2499 return ram_block_discard_range(rb, start, length); 2500 } 2501 2502 /* 2503 * For every allocation, we will try not to crash the VM if the 2504 * allocation failed. 
2505 */ 2506 static int xbzrle_init(void) 2507 { 2508 Error *local_err = NULL; 2509 2510 if (!migrate_use_xbzrle()) { 2511 return 0; 2512 } 2513 2514 XBZRLE_cache_lock(); 2515 2516 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2517 if (!XBZRLE.zero_target_page) { 2518 error_report("%s: Error allocating zero page", __func__); 2519 goto err_out; 2520 } 2521 2522 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2523 TARGET_PAGE_SIZE, &local_err); 2524 if (!XBZRLE.cache) { 2525 error_report_err(local_err); 2526 goto free_zero_page; 2527 } 2528 2529 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2530 if (!XBZRLE.encoded_buf) { 2531 error_report("%s: Error allocating encoded_buf", __func__); 2532 goto free_cache; 2533 } 2534 2535 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2536 if (!XBZRLE.current_buf) { 2537 error_report("%s: Error allocating current_buf", __func__); 2538 goto free_encoded_buf; 2539 } 2540 2541 /* We are all good */ 2542 XBZRLE_cache_unlock(); 2543 return 0; 2544 2545 free_encoded_buf: 2546 g_free(XBZRLE.encoded_buf); 2547 XBZRLE.encoded_buf = NULL; 2548 free_cache: 2549 cache_fini(XBZRLE.cache); 2550 XBZRLE.cache = NULL; 2551 free_zero_page: 2552 g_free(XBZRLE.zero_target_page); 2553 XBZRLE.zero_target_page = NULL; 2554 err_out: 2555 XBZRLE_cache_unlock(); 2556 return -ENOMEM; 2557 } 2558 2559 static int ram_state_init(RAMState **rsp) 2560 { 2561 *rsp = g_try_new0(RAMState, 1); 2562 2563 if (!*rsp) { 2564 error_report("%s: Init ramstate fail", __func__); 2565 return -1; 2566 } 2567 2568 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2569 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2570 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2571 2572 /* 2573 * Count the total number of pages used by ram blocks not including any 2574 * gaps due to alignment or unplugs. 2575 * This must match with the initial values of dirty bitmap. 2576 */ 2577 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2578 ram_state_reset(*rsp); 2579 2580 return 0; 2581 } 2582 2583 static void ram_list_init_bitmaps(void) 2584 { 2585 MigrationState *ms = migrate_get_current(); 2586 RAMBlock *block; 2587 unsigned long pages; 2588 uint8_t shift; 2589 2590 /* Skip setting bitmap if there is no RAM */ 2591 if (ram_bytes_total()) { 2592 shift = ms->clear_bitmap_shift; 2593 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2594 error_report("clear_bitmap_shift (%u) too big, using " 2595 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2596 shift = CLEAR_BITMAP_SHIFT_MAX; 2597 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2598 error_report("clear_bitmap_shift (%u) too small, using " 2599 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2600 shift = CLEAR_BITMAP_SHIFT_MIN; 2601 } 2602 2603 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2604 pages = block->max_length >> TARGET_PAGE_BITS; 2605 /* 2606 * The initial dirty bitmap for migration must be set with all 2607 * ones to make sure we'll migrate every guest RAM page to 2608 * destination. 2609 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2610 * new migration after a failed migration, ram_list. 2611 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2612 * guest memory. 2613 */ 2614 block->bmap = bitmap_new(pages); 2615 bitmap_set(block->bmap, 0, pages); 2616 block->clear_bmap_shift = shift; 2617 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2618 } 2619 } 2620 } 2621 2622 static void ram_init_bitmaps(RAMState *rs) 2623 { 2624 /* For memory_global_dirty_log_start below. 
*/ 2625 qemu_mutex_lock_iothread(); 2626 qemu_mutex_lock_ramlist(); 2627 2628 WITH_RCU_READ_LOCK_GUARD() { 2629 ram_list_init_bitmaps(); 2630 /* We don't use dirty log with background snapshots */ 2631 if (!migrate_background_snapshot()) { 2632 memory_global_dirty_log_start(); 2633 migration_bitmap_sync_precopy(rs); 2634 } 2635 } 2636 qemu_mutex_unlock_ramlist(); 2637 qemu_mutex_unlock_iothread(); 2638 } 2639 2640 static int ram_init_all(RAMState **rsp) 2641 { 2642 if (ram_state_init(rsp)) { 2643 return -1; 2644 } 2645 2646 if (xbzrle_init()) { 2647 ram_state_cleanup(rsp); 2648 return -1; 2649 } 2650 2651 ram_init_bitmaps(*rsp); 2652 2653 return 0; 2654 } 2655 2656 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2657 { 2658 RAMBlock *block; 2659 uint64_t pages = 0; 2660 2661 /* 2662 * Postcopy is not using xbzrle/compression, so no need for that. 2663 * Also, since source are already halted, we don't need to care 2664 * about dirty page logging as well. 2665 */ 2666 2667 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2668 pages += bitmap_count_one(block->bmap, 2669 block->used_length >> TARGET_PAGE_BITS); 2670 } 2671 2672 /* This may not be aligned with current bitmaps. Recalculate. */ 2673 rs->migration_dirty_pages = pages; 2674 2675 rs->last_seen_block = NULL; 2676 rs->last_sent_block = NULL; 2677 rs->last_page = 0; 2678 rs->last_version = ram_list.version; 2679 /* 2680 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2681 * matter what we have sent. 2682 */ 2683 rs->ram_bulk_stage = false; 2684 2685 /* Update RAMState cache of output QEMUFile */ 2686 rs->f = out; 2687 2688 trace_ram_state_resume_prepare(pages); 2689 } 2690 2691 /* 2692 * This function clears bits of the free pages reported by the caller from the 2693 * migration dirty bitmap. @addr is the host address corresponding to the 2694 * start of the continuous guest free pages, and @len is the total bytes of 2695 * those pages. 2696 */ 2697 void qemu_guest_free_page_hint(void *addr, size_t len) 2698 { 2699 RAMBlock *block; 2700 ram_addr_t offset; 2701 size_t used_len, start, npages; 2702 MigrationState *s = migrate_get_current(); 2703 2704 /* This function is currently expected to be used during live migration */ 2705 if (!migration_is_setup_or_active(s->state)) { 2706 return; 2707 } 2708 2709 for (; len > 0; len -= used_len, addr += used_len) { 2710 block = qemu_ram_block_from_host(addr, false, &offset); 2711 if (unlikely(!block || offset >= block->used_length)) { 2712 /* 2713 * The implementation might not support RAMBlock resize during 2714 * live migration, but it could happen in theory with future 2715 * updates. So we add a check here to capture that case. 2716 */ 2717 error_report_once("%s unexpected error", __func__); 2718 return; 2719 } 2720 2721 if (len <= block->used_length - offset) { 2722 used_len = len; 2723 } else { 2724 used_len = block->used_length - offset; 2725 } 2726 2727 start = offset >> TARGET_PAGE_BITS; 2728 npages = used_len >> TARGET_PAGE_BITS; 2729 2730 qemu_mutex_lock(&ram_state->bitmap_mutex); 2731 ram_state->migration_dirty_pages -= 2732 bitmap_count_one_with_offset(block->bmap, start, npages); 2733 bitmap_clear(block->bmap, start, npages); 2734 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2735 } 2736 } 2737 2738 /* 2739 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2740 * long-running RCU critical section. When rcu-reclaims in the code 2741 * start to become numerous it will be necessary to reduce the 2742 * granularity of these critical sections. 
2743 */ 2744 2745 /** 2746 * ram_save_setup: Setup RAM for migration 2747 * 2748 * Returns zero to indicate success and negative for error 2749 * 2750 * @f: QEMUFile where to send the data 2751 * @opaque: RAMState pointer 2752 */ 2753 static int ram_save_setup(QEMUFile *f, void *opaque) 2754 { 2755 RAMState **rsp = opaque; 2756 RAMBlock *block; 2757 2758 if (compress_threads_save_setup()) { 2759 return -1; 2760 } 2761 2762 /* migration has already setup the bitmap, reuse it. */ 2763 if (!migration_in_colo_state()) { 2764 if (ram_init_all(rsp) != 0) { 2765 compress_threads_save_cleanup(); 2766 return -1; 2767 } 2768 } 2769 (*rsp)->f = f; 2770 2771 WITH_RCU_READ_LOCK_GUARD() { 2772 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 2773 2774 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2775 qemu_put_byte(f, strlen(block->idstr)); 2776 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2777 qemu_put_be64(f, block->used_length); 2778 if (migrate_postcopy_ram() && block->page_size != 2779 qemu_host_page_size) { 2780 qemu_put_be64(f, block->page_size); 2781 } 2782 if (migrate_ignore_shared()) { 2783 qemu_put_be64(f, block->mr->addr); 2784 } 2785 } 2786 } 2787 2788 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2789 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2790 2791 multifd_send_sync_main(f); 2792 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2793 qemu_fflush(f); 2794 2795 return 0; 2796 } 2797 2798 /** 2799 * ram_save_iterate: iterative stage for migration 2800 * 2801 * Returns zero to indicate success and negative for error 2802 * 2803 * @f: QEMUFile where to send the data 2804 * @opaque: RAMState pointer 2805 */ 2806 static int ram_save_iterate(QEMUFile *f, void *opaque) 2807 { 2808 RAMState **temp = opaque; 2809 RAMState *rs = *temp; 2810 int ret = 0; 2811 int i; 2812 int64_t t0; 2813 int done = 0; 2814 2815 if (blk_mig_bulk_active()) { 2816 /* Avoid transferring ram during bulk phase of block migration as 2817 * the bulk phase will usually take a long time and transferring 2818 * ram updates during that time is pointless. */ 2819 goto out; 2820 } 2821 2822 WITH_RCU_READ_LOCK_GUARD() { 2823 if (ram_list.version != rs->last_version) { 2824 ram_state_reset(rs); 2825 } 2826 2827 /* Read version before ram_list.blocks */ 2828 smp_rmb(); 2829 2830 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2831 2832 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2833 i = 0; 2834 while ((ret = qemu_file_rate_limit(f)) == 0 || 2835 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 2836 int pages; 2837 2838 if (qemu_file_get_error(f)) { 2839 break; 2840 } 2841 2842 pages = ram_find_and_save_block(rs, false); 2843 /* no more pages to sent */ 2844 if (pages == 0) { 2845 done = 1; 2846 break; 2847 } 2848 2849 if (pages < 0) { 2850 qemu_file_set_error(f, pages); 2851 break; 2852 } 2853 2854 rs->target_page_count += pages; 2855 2856 /* 2857 * During postcopy, it is necessary to make sure one whole host 2858 * page is sent in one chunk. 2859 */ 2860 if (migrate_postcopy_ram()) { 2861 flush_compressed_data(rs); 2862 } 2863 2864 /* 2865 * we want to check in the 1st loop, just in case it was the 1st 2866 * time and we had to sync the dirty bitmap. 
2867 * qemu_clock_get_ns() is a bit expensive, so we only check each 2868 * some iterations 2869 */ 2870 if ((i & 63) == 0) { 2871 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 2872 1000000; 2873 if (t1 > MAX_WAIT) { 2874 trace_ram_save_iterate_big_wait(t1, i); 2875 break; 2876 } 2877 } 2878 i++; 2879 } 2880 } 2881 2882 /* 2883 * Must occur before EOS (or any QEMUFile operation) 2884 * because of RDMA protocol. 2885 */ 2886 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2887 2888 out: 2889 if (ret >= 0 2890 && migration_is_setup_or_active(migrate_get_current()->state)) { 2891 multifd_send_sync_main(rs->f); 2892 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2893 qemu_fflush(f); 2894 ram_counters.transferred += 8; 2895 2896 ret = qemu_file_get_error(f); 2897 } 2898 if (ret < 0) { 2899 return ret; 2900 } 2901 2902 return done; 2903 } 2904 2905 /** 2906 * ram_save_complete: function called to send the remaining amount of ram 2907 * 2908 * Returns zero to indicate success or negative on error 2909 * 2910 * Called with iothread lock 2911 * 2912 * @f: QEMUFile where to send the data 2913 * @opaque: RAMState pointer 2914 */ 2915 static int ram_save_complete(QEMUFile *f, void *opaque) 2916 { 2917 RAMState **temp = opaque; 2918 RAMState *rs = *temp; 2919 int ret = 0; 2920 2921 WITH_RCU_READ_LOCK_GUARD() { 2922 if (!migration_in_postcopy()) { 2923 migration_bitmap_sync_precopy(rs); 2924 } 2925 2926 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2927 2928 /* try transferring iterative blocks of memory */ 2929 2930 /* flush all remaining blocks regardless of rate limiting */ 2931 while (true) { 2932 int pages; 2933 2934 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2935 /* no more blocks to sent */ 2936 if (pages == 0) { 2937 break; 2938 } 2939 if (pages < 0) { 2940 ret = pages; 2941 break; 2942 } 2943 } 2944 2945 flush_compressed_data(rs); 2946 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2947 } 2948 2949 if (ret >= 0) { 2950 multifd_send_sync_main(rs->f); 2951 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2952 qemu_fflush(f); 2953 } 2954 2955 return ret; 2956 } 2957 2958 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2959 uint64_t *res_precopy_only, 2960 uint64_t *res_compatible, 2961 uint64_t *res_postcopy_only) 2962 { 2963 RAMState **temp = opaque; 2964 RAMState *rs = *temp; 2965 uint64_t remaining_size; 2966 2967 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2968 2969 if (!migration_in_postcopy() && 2970 remaining_size < max_size) { 2971 qemu_mutex_lock_iothread(); 2972 WITH_RCU_READ_LOCK_GUARD() { 2973 migration_bitmap_sync_precopy(rs); 2974 } 2975 qemu_mutex_unlock_iothread(); 2976 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2977 } 2978 2979 if (migrate_postcopy_ram()) { 2980 /* We can do postcopy, and all the data is postcopiable */ 2981 *res_compatible += remaining_size; 2982 } else { 2983 *res_precopy_only += remaining_size; 2984 } 2985 } 2986 2987 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2988 { 2989 unsigned int xh_len; 2990 int xh_flags; 2991 uint8_t *loaded_data; 2992 2993 /* extract RLE header */ 2994 xh_flags = qemu_get_byte(f); 2995 xh_len = qemu_get_be16(f); 2996 2997 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2998 error_report("Failed to load XBZRLE page - wrong compression!"); 2999 return -1; 3000 } 3001 3002 if (xh_len > TARGET_PAGE_SIZE) { 3003 error_report("Failed to load XBZRLE page - len overflow!"); 3004 return -1; 3005 } 3006 loaded_data = XBZRLE.decoded_buf; 3007 
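
    /*
     * Illustrative note (not authoritative): the XBZRLE record parsed here
     * is laid out on the wire as
     *
     *     flags (1 byte) | encoded length (2 bytes, big endian) | encoded data
     *
     * so the xh_len bytes read below are the delta-encoded payload, which
     * xbzrle_decode_buffer() applies on top of the existing contents of
     * 'host'.
     */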
/* load data and decode */ 3008 /* it can change loaded_data to point to an internal buffer */ 3009 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3010 3011 /* decode RLE */ 3012 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3013 TARGET_PAGE_SIZE) == -1) { 3014 error_report("Failed to load XBZRLE page - decode error!"); 3015 return -1; 3016 } 3017 3018 return 0; 3019 } 3020 3021 /** 3022 * ram_block_from_stream: read a RAMBlock id from the migration stream 3023 * 3024 * Must be called from within a rcu critical section. 3025 * 3026 * Returns a pointer from within the RCU-protected ram_list. 3027 * 3028 * @f: QEMUFile where to read the data from 3029 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3030 */ 3031 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 3032 { 3033 static RAMBlock *block; 3034 char id[256]; 3035 uint8_t len; 3036 3037 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3038 if (!block) { 3039 error_report("Ack, bad migration stream!"); 3040 return NULL; 3041 } 3042 return block; 3043 } 3044 3045 len = qemu_get_byte(f); 3046 qemu_get_buffer(f, (uint8_t *)id, len); 3047 id[len] = 0; 3048 3049 block = qemu_ram_block_by_name(id); 3050 if (!block) { 3051 error_report("Can't find block %s", id); 3052 return NULL; 3053 } 3054 3055 if (ramblock_is_ignored(block)) { 3056 error_report("block %s should not be migrated !", id); 3057 return NULL; 3058 } 3059 3060 return block; 3061 } 3062 3063 static inline void *host_from_ram_block_offset(RAMBlock *block, 3064 ram_addr_t offset) 3065 { 3066 if (!offset_in_ramblock(block, offset)) { 3067 return NULL; 3068 } 3069 3070 return block->host + offset; 3071 } 3072 3073 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3074 ram_addr_t offset, bool record_bitmap) 3075 { 3076 if (!offset_in_ramblock(block, offset)) { 3077 return NULL; 3078 } 3079 if (!block->colo_cache) { 3080 error_report("%s: colo_cache is NULL in block :%s", 3081 __func__, block->idstr); 3082 return NULL; 3083 } 3084 3085 /* 3086 * During colo checkpoint, we need bitmap of these migrated pages. 3087 * It help us to decide which pages in ram cache should be flushed 3088 * into VM's RAM later. 3089 */ 3090 if (record_bitmap && 3091 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3092 ram_state->migration_dirty_pages++; 3093 } 3094 return block->colo_cache + offset; 3095 } 3096 3097 /** 3098 * ram_handle_compressed: handle the zero page case 3099 * 3100 * If a page (or a whole RDMA chunk) has been 3101 * determined to be zero, then zap it. 3102 * 3103 * @host: host address for the zero page 3104 * @ch: what the page is filled from. 
We only support zero 3105 * @size: size of the zero page 3106 */ 3107 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3108 { 3109 if (ch != 0 || !is_zero_range(host, size)) { 3110 memset(host, ch, size); 3111 } 3112 } 3113 3114 /* return the size after decompression, or negative value on error */ 3115 static int 3116 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3117 const uint8_t *source, size_t source_len) 3118 { 3119 int err; 3120 3121 err = inflateReset(stream); 3122 if (err != Z_OK) { 3123 return -1; 3124 } 3125 3126 stream->avail_in = source_len; 3127 stream->next_in = (uint8_t *)source; 3128 stream->avail_out = dest_len; 3129 stream->next_out = dest; 3130 3131 err = inflate(stream, Z_NO_FLUSH); 3132 if (err != Z_STREAM_END) { 3133 return -1; 3134 } 3135 3136 return stream->total_out; 3137 } 3138 3139 static void *do_data_decompress(void *opaque) 3140 { 3141 DecompressParam *param = opaque; 3142 unsigned long pagesize; 3143 uint8_t *des; 3144 int len, ret; 3145 3146 qemu_mutex_lock(¶m->mutex); 3147 while (!param->quit) { 3148 if (param->des) { 3149 des = param->des; 3150 len = param->len; 3151 param->des = 0; 3152 qemu_mutex_unlock(¶m->mutex); 3153 3154 pagesize = TARGET_PAGE_SIZE; 3155 3156 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 3157 param->compbuf, len); 3158 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3159 error_report("decompress data failed"); 3160 qemu_file_set_error(decomp_file, ret); 3161 } 3162 3163 qemu_mutex_lock(&decomp_done_lock); 3164 param->done = true; 3165 qemu_cond_signal(&decomp_done_cond); 3166 qemu_mutex_unlock(&decomp_done_lock); 3167 3168 qemu_mutex_lock(¶m->mutex); 3169 } else { 3170 qemu_cond_wait(¶m->cond, ¶m->mutex); 3171 } 3172 } 3173 qemu_mutex_unlock(¶m->mutex); 3174 3175 return NULL; 3176 } 3177 3178 static int wait_for_decompress_done(void) 3179 { 3180 int idx, thread_count; 3181 3182 if (!migrate_use_compression()) { 3183 return 0; 3184 } 3185 3186 thread_count = migrate_decompress_threads(); 3187 qemu_mutex_lock(&decomp_done_lock); 3188 for (idx = 0; idx < thread_count; idx++) { 3189 while (!decomp_param[idx].done) { 3190 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3191 } 3192 } 3193 qemu_mutex_unlock(&decomp_done_lock); 3194 return qemu_file_get_error(decomp_file); 3195 } 3196 3197 static void compress_threads_load_cleanup(void) 3198 { 3199 int i, thread_count; 3200 3201 if (!migrate_use_compression()) { 3202 return; 3203 } 3204 thread_count = migrate_decompress_threads(); 3205 for (i = 0; i < thread_count; i++) { 3206 /* 3207 * we use it as a indicator which shows if the thread is 3208 * properly init'd or not 3209 */ 3210 if (!decomp_param[i].compbuf) { 3211 break; 3212 } 3213 3214 qemu_mutex_lock(&decomp_param[i].mutex); 3215 decomp_param[i].quit = true; 3216 qemu_cond_signal(&decomp_param[i].cond); 3217 qemu_mutex_unlock(&decomp_param[i].mutex); 3218 } 3219 for (i = 0; i < thread_count; i++) { 3220 if (!decomp_param[i].compbuf) { 3221 break; 3222 } 3223 3224 qemu_thread_join(decompress_threads + i); 3225 qemu_mutex_destroy(&decomp_param[i].mutex); 3226 qemu_cond_destroy(&decomp_param[i].cond); 3227 inflateEnd(&decomp_param[i].stream); 3228 g_free(decomp_param[i].compbuf); 3229 decomp_param[i].compbuf = NULL; 3230 } 3231 g_free(decompress_threads); 3232 g_free(decomp_param); 3233 decompress_threads = NULL; 3234 decomp_param = NULL; 3235 decomp_file = NULL; 3236 } 3237 3238 static int compress_threads_load_setup(QEMUFile *f) 3239 { 3240 int i, thread_count; 
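
    /*
     * Descriptive note: each decompress thread runs do_data_decompress() and
     * waits on its own param->cond/mutex pair. Work is handed to a thread by
     * filling param->compbuf/des/len and signalling param->cond (see
     * decompress_data_with_multi_threads() below), and completion is reported
     * back through param->done under decomp_done_lock/decomp_done_cond (see
     * wait_for_decompress_done() above).
     */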

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/*
 * We must set ram_bulk_stage to false, otherwise in
 * migration_bitmap_find_dirty the bitmap will be unused and
 * all the pages in the ram cache will be flushed to the ram of the
 * secondary VM.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
    ram_state->ram_bulk_stage = false;
}

/*
 * colo cache: this is for the secondary VM, we cache the whole
 * memory of the secondary VM. It is necessary to hold the global lock
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL,
                                                    false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
        }
    }

    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3346 */ 3347 if (ram_bytes_total()) { 3348 RAMBlock *block; 3349 3350 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3351 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3352 block->bmap = bitmap_new(pages); 3353 } 3354 } 3355 3356 colo_init_ram_state(); 3357 return 0; 3358 } 3359 3360 /* TODO: duplicated with ram_init_bitmaps */ 3361 void colo_incoming_start_dirty_log(void) 3362 { 3363 RAMBlock *block = NULL; 3364 /* For memory_global_dirty_log_start below. */ 3365 qemu_mutex_lock_iothread(); 3366 qemu_mutex_lock_ramlist(); 3367 3368 memory_global_dirty_log_sync(); 3369 WITH_RCU_READ_LOCK_GUARD() { 3370 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3371 ramblock_sync_dirty_bitmap(ram_state, block); 3372 /* Discard this dirty bitmap record */ 3373 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3374 } 3375 memory_global_dirty_log_start(); 3376 } 3377 ram_state->migration_dirty_pages = 0; 3378 qemu_mutex_unlock_ramlist(); 3379 qemu_mutex_unlock_iothread(); 3380 } 3381 3382 /* It is need to hold the global lock to call this helper */ 3383 void colo_release_ram_cache(void) 3384 { 3385 RAMBlock *block; 3386 3387 memory_global_dirty_log_stop(); 3388 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3389 g_free(block->bmap); 3390 block->bmap = NULL; 3391 } 3392 3393 WITH_RCU_READ_LOCK_GUARD() { 3394 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3395 if (block->colo_cache) { 3396 qemu_anon_ram_free(block->colo_cache, block->used_length); 3397 block->colo_cache = NULL; 3398 } 3399 } 3400 } 3401 ram_state_cleanup(&ram_state); 3402 } 3403 3404 /** 3405 * ram_load_setup: Setup RAM for migration incoming side 3406 * 3407 * Returns zero to indicate success and negative for error 3408 * 3409 * @f: QEMUFile where to receive the data 3410 * @opaque: RAMState pointer 3411 */ 3412 static int ram_load_setup(QEMUFile *f, void *opaque) 3413 { 3414 if (compress_threads_load_setup(f)) { 3415 return -1; 3416 } 3417 3418 xbzrle_load_setup(); 3419 ramblock_recv_map_init(); 3420 3421 return 0; 3422 } 3423 3424 static int ram_load_cleanup(void *opaque) 3425 { 3426 RAMBlock *rb; 3427 3428 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3429 qemu_ram_block_writeback(rb); 3430 } 3431 3432 xbzrle_load_cleanup(); 3433 compress_threads_load_cleanup(); 3434 3435 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3436 g_free(rb->receivedmap); 3437 rb->receivedmap = NULL; 3438 } 3439 3440 return 0; 3441 } 3442 3443 /** 3444 * ram_postcopy_incoming_init: allocate postcopy data structures 3445 * 3446 * Returns 0 for success and negative if there was one error 3447 * 3448 * @mis: current migration incoming state 3449 * 3450 * Allocate data structures etc needed by incoming migration with 3451 * postcopy-ram. postcopy-ram's similarly names 3452 * postcopy_ram_incoming_init does the work. 3453 */ 3454 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3455 { 3456 return postcopy_ram_incoming_init(mis); 3457 } 3458 3459 /** 3460 * ram_load_postcopy: load a page in postcopy case 3461 * 3462 * Returns 0 for success or -errno in case of error 3463 * 3464 * Called in postcopy mode by ram_load(). 3465 * rcu_read_lock is taken prior to this being called. 
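 *
 * Informal note: a whole host page is reassembled from its target pages in
 * mis->postcopy_tmp_page and only placed atomically once its last target
 * page has arrived; for example, a 2MiB hugetlbfs page is accumulated from
 * 2MiB / 4KiB = 512 target pages before postcopy_place_page() runs.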
3466 * 3467 * @f: QEMUFile where to send the data 3468 */ 3469 static int ram_load_postcopy(QEMUFile *f) 3470 { 3471 int flags = 0, ret = 0; 3472 bool place_needed = false; 3473 bool matches_target_page_size = false; 3474 MigrationIncomingState *mis = migration_incoming_get_current(); 3475 /* Temporary page that is later 'placed' */ 3476 void *postcopy_host_page = mis->postcopy_tmp_page; 3477 void *this_host = NULL; 3478 bool all_zero = true; 3479 int target_pages = 0; 3480 3481 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3482 ram_addr_t addr; 3483 void *host = NULL; 3484 void *page_buffer = NULL; 3485 void *place_source = NULL; 3486 RAMBlock *block = NULL; 3487 uint8_t ch; 3488 int len; 3489 3490 addr = qemu_get_be64(f); 3491 3492 /* 3493 * If qemu file error, we should stop here, and then "addr" 3494 * may be invalid 3495 */ 3496 ret = qemu_file_get_error(f); 3497 if (ret) { 3498 break; 3499 } 3500 3501 flags = addr & ~TARGET_PAGE_MASK; 3502 addr &= TARGET_PAGE_MASK; 3503 3504 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3505 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3506 RAM_SAVE_FLAG_COMPRESS_PAGE)) { 3507 block = ram_block_from_stream(f, flags); 3508 3509 host = host_from_ram_block_offset(block, addr); 3510 if (!host) { 3511 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3512 ret = -EINVAL; 3513 break; 3514 } 3515 target_pages++; 3516 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3517 /* 3518 * Postcopy requires that we place whole host pages atomically; 3519 * these may be huge pages for RAMBlocks that are backed by 3520 * hugetlbfs. 3521 * To make it atomic, the data is read into a temporary page 3522 * that's moved into place later. 3523 * The migration protocol uses, possibly smaller, target-pages 3524 * however the source ensures it always sends all the components 3525 * of a host page in one chunk. 3526 */ 3527 page_buffer = postcopy_host_page + 3528 ((uintptr_t)host & (block->page_size - 1)); 3529 if (target_pages == 1) { 3530 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3531 block->page_size); 3532 } else { 3533 /* not the 1st TP within the HP */ 3534 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != 3535 (uintptr_t)this_host) { 3536 error_report("Non-same host page %p/%p", 3537 host, this_host); 3538 ret = -EINVAL; 3539 break; 3540 } 3541 } 3542 3543 /* 3544 * If it's the last part of a host page then we place the host 3545 * page 3546 */ 3547 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { 3548 place_needed = true; 3549 } 3550 place_source = postcopy_host_page; 3551 } 3552 3553 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3554 case RAM_SAVE_FLAG_ZERO: 3555 ch = qemu_get_byte(f); 3556 /* 3557 * Can skip to set page_buffer when 3558 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3559 */ 3560 if (ch || !matches_target_page_size) { 3561 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3562 } 3563 if (ch) { 3564 all_zero = false; 3565 } 3566 break; 3567 3568 case RAM_SAVE_FLAG_PAGE: 3569 all_zero = false; 3570 if (!matches_target_page_size) { 3571 /* For huge pages, we always use temporary buffer */ 3572 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3573 } else { 3574 /* 3575 * For small pages that matches target page size, we 3576 * avoid the qemu_file copy. Instead we directly use 3577 * the buffer of QEMUFile to place the page. Note: we 3578 * cannot do any QEMUFile operation before using that 3579 * buffer to make sure the buffer is valid when 3580 * placing the page. 
3581 */ 3582 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3583 TARGET_PAGE_SIZE); 3584 } 3585 break; 3586 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3587 all_zero = false; 3588 len = qemu_get_be32(f); 3589 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3590 error_report("Invalid compressed data length: %d", len); 3591 ret = -EINVAL; 3592 break; 3593 } 3594 decompress_data_with_multi_threads(f, page_buffer, len); 3595 break; 3596 3597 case RAM_SAVE_FLAG_EOS: 3598 /* normal exit */ 3599 multifd_recv_sync_main(); 3600 break; 3601 default: 3602 error_report("Unknown combination of migration flags: 0x%x" 3603 " (postcopy mode)", flags); 3604 ret = -EINVAL; 3605 break; 3606 } 3607 3608 /* Got the whole host page, wait for decompress before placing. */ 3609 if (place_needed) { 3610 ret |= wait_for_decompress_done(); 3611 } 3612 3613 /* Detect for any possible file errors */ 3614 if (!ret && qemu_file_get_error(f)) { 3615 ret = qemu_file_get_error(f); 3616 } 3617 3618 if (!ret && place_needed) { 3619 /* This gets called at the last target page in the host page */ 3620 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, 3621 block->page_size); 3622 3623 if (all_zero) { 3624 ret = postcopy_place_page_zero(mis, place_dest, 3625 block); 3626 } else { 3627 ret = postcopy_place_page(mis, place_dest, 3628 place_source, block); 3629 } 3630 place_needed = false; 3631 target_pages = 0; 3632 /* Assume we have a zero page until we detect something different */ 3633 all_zero = true; 3634 } 3635 } 3636 3637 return ret; 3638 } 3639 3640 static bool postcopy_is_advised(void) 3641 { 3642 PostcopyState ps = postcopy_state_get(); 3643 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3644 } 3645 3646 static bool postcopy_is_running(void) 3647 { 3648 PostcopyState ps = postcopy_state_get(); 3649 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3650 } 3651 3652 /* 3653 * Flush content of RAM cache into SVM's memory. 3654 * Only flush the pages that be dirtied by PVM or SVM or both. 3655 */ 3656 void colo_flush_ram_cache(void) 3657 { 3658 RAMBlock *block = NULL; 3659 void *dst_host; 3660 void *src_host; 3661 unsigned long offset = 0; 3662 3663 memory_global_dirty_log_sync(); 3664 WITH_RCU_READ_LOCK_GUARD() { 3665 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3666 ramblock_sync_dirty_bitmap(ram_state, block); 3667 } 3668 } 3669 3670 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3671 WITH_RCU_READ_LOCK_GUARD() { 3672 block = QLIST_FIRST_RCU(&ram_list.blocks); 3673 3674 while (block) { 3675 offset = migration_bitmap_find_dirty(ram_state, block, offset); 3676 3677 if (((ram_addr_t)offset) << TARGET_PAGE_BITS 3678 >= block->used_length) { 3679 offset = 0; 3680 block = QLIST_NEXT_RCU(block, next); 3681 } else { 3682 migration_bitmap_clear_dirty(ram_state, block, offset); 3683 dst_host = block->host 3684 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3685 src_host = block->colo_cache 3686 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3687 memcpy(dst_host, src_host, TARGET_PAGE_SIZE); 3688 } 3689 } 3690 } 3691 trace_colo_flush_ram_cache_end(); 3692 } 3693 3694 /** 3695 * ram_load_precopy: load pages in precopy case 3696 * 3697 * Returns 0 for success or -errno in case of error 3698 * 3699 * Called in precopy mode by ram_load(). 3700 * rcu_read_lock is taken prior to this being called. 
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage, we should not load pages into
             * the SVM's memory directly; we put them into colo_cache first.
             * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
             * Previously, we copied all of this memory in the COLO preparing
             * stage while the VM was stopped, which was time-consuming.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled; this slows the
             * migration down a bit, but it clearly reduces the downtime of
             * backing up all the SVM's memory in the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In the migration stage but before the COLO stage,
                     * put all pages into both the cache and the SVM's memory.
3766 */ 3767 host_bak = colo_cache_from_block_offset(block, addr, false); 3768 } 3769 } 3770 if (!host) { 3771 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3772 ret = -EINVAL; 3773 break; 3774 } 3775 if (!migration_incoming_in_colo_state()) { 3776 ramblock_recv_bitmap_set(block, host); 3777 } 3778 3779 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3780 } 3781 3782 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3783 case RAM_SAVE_FLAG_MEM_SIZE: 3784 /* Synchronize RAM block list */ 3785 total_ram_bytes = addr; 3786 while (!ret && total_ram_bytes) { 3787 RAMBlock *block; 3788 char id[256]; 3789 ram_addr_t length; 3790 3791 len = qemu_get_byte(f); 3792 qemu_get_buffer(f, (uint8_t *)id, len); 3793 id[len] = 0; 3794 length = qemu_get_be64(f); 3795 3796 block = qemu_ram_block_by_name(id); 3797 if (block && !qemu_ram_is_migratable(block)) { 3798 error_report("block %s should not be migrated !", id); 3799 ret = -EINVAL; 3800 } else if (block) { 3801 if (length != block->used_length) { 3802 Error *local_err = NULL; 3803 3804 ret = qemu_ram_resize(block, length, 3805 &local_err); 3806 if (local_err) { 3807 error_report_err(local_err); 3808 } 3809 } 3810 /* For postcopy we need to check hugepage sizes match */ 3811 if (postcopy_advised && migrate_postcopy_ram() && 3812 block->page_size != qemu_host_page_size) { 3813 uint64_t remote_page_size = qemu_get_be64(f); 3814 if (remote_page_size != block->page_size) { 3815 error_report("Mismatched RAM page size %s " 3816 "(local) %zd != %" PRId64, 3817 id, block->page_size, 3818 remote_page_size); 3819 ret = -EINVAL; 3820 } 3821 } 3822 if (migrate_ignore_shared()) { 3823 hwaddr addr = qemu_get_be64(f); 3824 if (ramblock_is_ignored(block) && 3825 block->mr->addr != addr) { 3826 error_report("Mismatched GPAs for block %s " 3827 "%" PRId64 "!= %" PRId64, 3828 id, (uint64_t)addr, 3829 (uint64_t)block->mr->addr); 3830 ret = -EINVAL; 3831 } 3832 } 3833 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3834 block->idstr); 3835 } else { 3836 error_report("Unknown ramblock \"%s\", cannot " 3837 "accept migration", id); 3838 ret = -EINVAL; 3839 } 3840 3841 total_ram_bytes -= length; 3842 } 3843 break; 3844 3845 case RAM_SAVE_FLAG_ZERO: 3846 ch = qemu_get_byte(f); 3847 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3848 break; 3849 3850 case RAM_SAVE_FLAG_PAGE: 3851 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3852 break; 3853 3854 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3855 len = qemu_get_be32(f); 3856 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3857 error_report("Invalid compressed data length: %d", len); 3858 ret = -EINVAL; 3859 break; 3860 } 3861 decompress_data_with_multi_threads(f, host, len); 3862 break; 3863 3864 case RAM_SAVE_FLAG_XBZRLE: 3865 if (load_xbzrle(f, addr, host) < 0) { 3866 error_report("Failed to decompress XBZRLE page at " 3867 RAM_ADDR_FMT, addr); 3868 ret = -EINVAL; 3869 break; 3870 } 3871 break; 3872 case RAM_SAVE_FLAG_EOS: 3873 /* normal exit */ 3874 multifd_recv_sync_main(); 3875 break; 3876 default: 3877 if (flags & RAM_SAVE_FLAG_HOOK) { 3878 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3879 } else { 3880 error_report("Unknown combination of migration flags: 0x%x", 3881 flags); 3882 ret = -EINVAL; 3883 } 3884 } 3885 if (!ret) { 3886 ret = qemu_file_get_error(f); 3887 } 3888 if (!ret && host_bak) { 3889 memcpy(host_bak, host, TARGET_PAGE_SIZE); 3890 } 3891 } 3892 3893 ret |= wait_for_decompress_done(); 3894 return ret; 3895 } 3896 3897 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3898 { 
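
    /*
     * Note (descriptive only): every record in the RAM stream below starts
     * with a be64 value whose low bits (addr & ~TARGET_PAGE_MASK) carry the
     * RAM_SAVE_FLAG_* bits and whose remaining bits are the page-aligned
     * offset; ram_load_precopy() and ram_load_postcopy() above parse it
     * exactly that way before dispatching on the flags.
     */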
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and invert it to use as the initial dirty
 * bitmap. This is only used when the postcopy migration is paused but
 * wants to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
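     *
     * For reference (an informal sketch), the layout read back below is:
     *
     *     be64 size
     *     little-endian bitmap, padded to a multiple of 8 bytes
     *     be64 end mark, which must equal RAMBLOCK_RECV_BITMAP_ENDING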
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock's */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for the current ramblock.
     * If this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
}
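
/*
 * Usage sketch (illustrative only, not part of the build): the handlers
 * above are registered once at startup via ram_mig_init(), and a postcopy
 * page request arriving over the return path on the source side ends up
 * as something like
 *
 *     ram_save_queue_pages("pc.ram", start, len);
 *
 * where "pc.ram" is just a hypothetical RAMBlock name; passing NULL for
 * the name reuses the RAMBlock of the previous request, as documented at
 * ram_save_queue_pages().
 */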