/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h, start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
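
/*
 * Illustration of how these flags are used on the wire: RAM page offsets are
 * always TARGET_PAGE_SIZE aligned, so the low bits of the 64-bit offset that
 * save_page_header() emits are free to carry the flags above. Ignoring the
 * optional block-id part of the header, a zero page is announced roughly as:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO);   // header word
 *     qemu_put_byte(f, 0);                             // the fill byte
 *
 * and the destination can split the header back apart with something like
 * "flags = header & ~TARGET_PAGE_MASK; addr = header & TARGET_PAGE_MASK;".
 */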

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
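
/*
 * For reference, a receiver of the stream produced above would, under the
 * same format assumptions, read it back roughly as follows (names are
 * illustrative, not the actual destination-side helper):
 *
 *     uint64_t size = qemu_get_be64(file);            // padded bitmap size
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     uint64_t end_mark = qemu_get_be64(file);        // expected to be
 *                                                     // RAMBLOCK_RECV_BITMAP_ENDING
 *
 * followed by a little-endian-to-host conversion of the bitmap.
 */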

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

/*
 * NOTE: not all stats in ram_counters are used in reality. See comments
 * for struct MigrationAtomicStats. The ultimate result of ram migration
 * counters will be a merged version with both ram_counters and the atomic
 * fields in ram_atomic_counters.
 */
MigrationStats ram_counters;
MigrationAtomicStats ram_atomic_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    stat64_add(&ram_atomic_counters.transferred, bytes);
}

void dirty_sync_missed_zero_copy(void)
{
    ram_counters.dirty_sync_missed_zero_copy++;
}

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);
    QEMUFile *f = pss->pss_channel;

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
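
/*
 * Worked example for the tailslow path above (numbers are illustrative):
 * with the guest currently throttled at 40% (cpu_now = 60) and dirtying
 * memory twice as fast as we can transfer it
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), the ideal guest CPU
 * share is cpu_ideal = 60 * 0.5 = 30, so the throttle is raised by
 * MIN(60 - 30, cpu_throttle_increment) rather than the full increment.
 */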

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}
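
/*
 * Sizing example for the chunked clearing above (illustrative numbers):
 * with 4 KiB target pages (TARGET_PAGE_BITS == 12) and a clear_bmap_shift
 * of 18, one clear chunk covers 1ULL << (12 + 18) bytes, i.e. 1 GiB of
 * guest RAM.  The assert(shift >= 6) above guarantees that a chunk is never
 * smaller than 64 target pages, keeping the clear bitmap aligned to an
 * unsigned long.
 */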

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&ram_atomic_counters.normal) +
        stat64_get(&ram_atomic_counters.duplicate) +
        compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;
    uint64_t bytes_xfer_period =
        stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}
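
/*
 * Trigger example (illustrative numbers): with throttle-trigger-threshold
 * set to 50 and 1 GiB transferred during the last sync period,
 * bytes_dirty_threshold is 512 MiB; if the guest dirtied more than that in
 * the same period on two consecutive sync intervals, auto-converge starts
 * (or tightens) CPU throttling via mig_throttle_guest_down().
 */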

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, block, offset);

    if (len) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_atomic_counters.normal, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_atomic_counters.normal, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_atomic_counters.normal, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, block, offset)) {
        return true;
    }

    save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically. We don't have to track
We don't have to track 1820 * modifications via userfaultfd WP reliably, because these pages will 1821 * not be part of the migration stream either way -- see 1822 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1823 * 1824 * Note: The result is only stable while migrating (precopy/postcopy). 1825 */ 1826 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1827 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1828 MemoryRegionSection section = { 1829 .mr = rb->mr, 1830 .offset_within_region = 0, 1831 .size = rb->mr->size, 1832 }; 1833 1834 ram_discard_manager_replay_populated(rdm, &section, 1835 populate_read_section, NULL); 1836 } else { 1837 populate_read_range(rb, 0, rb->used_length); 1838 } 1839 } 1840 1841 /* 1842 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1843 */ 1844 void ram_write_tracking_prepare(void) 1845 { 1846 RAMBlock *block; 1847 1848 RCU_READ_LOCK_GUARD(); 1849 1850 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1851 /* Nothing to do with read-only and MMIO-writable regions */ 1852 if (block->mr->readonly || block->mr->rom_device) { 1853 continue; 1854 } 1855 1856 /* 1857 * Populate pages of the RAM block before enabling userfault_fd 1858 * write protection. 1859 * 1860 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1861 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1862 * pages with pte_none() entries in page table. 1863 */ 1864 ram_block_populate_read(block); 1865 } 1866 } 1867 1868 static inline int uffd_protect_section(MemoryRegionSection *section, 1869 void *opaque) 1870 { 1871 const hwaddr size = int128_get64(section->size); 1872 const hwaddr offset = section->offset_within_region; 1873 RAMBlock *rb = section->mr->ram_block; 1874 int uffd_fd = (uintptr_t)opaque; 1875 1876 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1877 false); 1878 } 1879 1880 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1881 { 1882 assert(rb->flags & RAM_UF_WRITEPROTECT); 1883 1884 /* See ram_block_populate_read() */ 1885 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1886 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1887 MemoryRegionSection section = { 1888 .mr = rb->mr, 1889 .offset_within_region = 0, 1890 .size = rb->mr->size, 1891 }; 1892 1893 return ram_discard_manager_replay_populated(rdm, &section, 1894 uffd_protect_section, 1895 (void *)(uintptr_t)uffd_fd); 1896 } 1897 return uffd_change_protection(uffd_fd, rb->host, 1898 rb->used_length, true, false); 1899 } 1900 1901 /* 1902 * ram_write_tracking_start: start UFFD-WP memory tracking 1903 * 1904 * Returns 0 for success or negative value in case of error 1905 */ 1906 int ram_write_tracking_start(void) 1907 { 1908 int uffd_fd; 1909 RAMState *rs = ram_state; 1910 RAMBlock *block; 1911 1912 /* Open UFFD file descriptor */ 1913 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1914 if (uffd_fd < 0) { 1915 return uffd_fd; 1916 } 1917 rs->uffdio_fd = uffd_fd; 1918 1919 RCU_READ_LOCK_GUARD(); 1920 1921 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1922 /* Nothing to do with read-only and MMIO-writable regions */ 1923 if (block->mr->readonly || block->mr->rom_device) { 1924 continue; 1925 } 1926 1927 /* Register block memory with UFFD to track writes */ 1928 if (uffd_register_memory(rs->uffdio_fd, block->host, 1929 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1930 goto fail; 1931 } 1932 block->flags |= RAM_UF_WRITEPROTECT; 1933 memory_region_ref(block->mr); 1934 1935 /* Apply
UFFD write protection to the block memory range */ 1936 if (ram_block_uffd_protect(block, uffd_fd)) { 1937 goto fail; 1938 } 1939 1940 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1941 block->host, block->max_length); 1942 } 1943 1944 return 0; 1945 1946 fail: 1947 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1948 1949 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1950 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1951 continue; 1952 } 1953 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1954 /* Cleanup flags and remove reference */ 1955 block->flags &= ~RAM_UF_WRITEPROTECT; 1956 memory_region_unref(block->mr); 1957 } 1958 1959 uffd_close_fd(uffd_fd); 1960 rs->uffdio_fd = -1; 1961 return -1; 1962 } 1963 1964 /** 1965 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1966 */ 1967 void ram_write_tracking_stop(void) 1968 { 1969 RAMState *rs = ram_state; 1970 RAMBlock *block; 1971 1972 RCU_READ_LOCK_GUARD(); 1973 1974 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1975 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1976 continue; 1977 } 1978 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1979 1980 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1981 block->host, block->max_length); 1982 1983 /* Cleanup flags and remove reference */ 1984 block->flags &= ~RAM_UF_WRITEPROTECT; 1985 memory_region_unref(block->mr); 1986 } 1987 1988 /* Finally close UFFD file descriptor */ 1989 uffd_close_fd(rs->uffdio_fd); 1990 rs->uffdio_fd = -1; 1991 } 1992 1993 #else 1994 /* No target OS support, stubs just fail or ignore */ 1995 1996 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1997 { 1998 (void) rs; 1999 (void) offset; 2000 2001 return NULL; 2002 } 2003 2004 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 2005 unsigned long start_page) 2006 { 2007 (void) rs; 2008 (void) pss; 2009 (void) start_page; 2010 2011 return 0; 2012 } 2013 2014 bool ram_write_tracking_available(void) 2015 { 2016 return false; 2017 } 2018 2019 bool ram_write_tracking_compatible(void) 2020 { 2021 assert(0); 2022 return false; 2023 } 2024 2025 int ram_write_tracking_start(void) 2026 { 2027 assert(0); 2028 return -1; 2029 } 2030 2031 void ram_write_tracking_stop(void) 2032 { 2033 assert(0); 2034 } 2035 #endif /* defined(__linux__) */ 2036 2037 /** 2038 * get_queued_page: unqueue a page from the postcopy requests 2039 * 2040 * Skips pages that are already sent (!dirty) 2041 * 2042 * Returns true if a queued page is found 2043 * 2044 * @rs: current RAM state 2045 * @pss: data about the state of the current dirty page scan 2046 */ 2047 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 2048 { 2049 RAMBlock *block; 2050 ram_addr_t offset; 2051 bool dirty; 2052 2053 do { 2054 block = unqueue_page(rs, &offset); 2055 /* 2056 * We're sending this page, and since it's postcopy nothing else 2057 * will dirty it, and we must make sure it doesn't get sent again 2058 * even if this queue request was received after the background 2059 * search already sent it. 
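* For example, if the destination requests an offset in a block, say 'pc.ram', that the background scan has already sent (its bit in block->bmap is clear), the entry is skipped here and we move on to the next queued request.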
2060 */ 2061 if (block) { 2062 unsigned long page; 2063 2064 page = offset >> TARGET_PAGE_BITS; 2065 dirty = test_bit(page, block->bmap); 2066 if (!dirty) { 2067 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 2068 page); 2069 } else { 2070 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 2071 } 2072 } 2073 2074 } while (block && !dirty); 2075 2076 if (!block) { 2077 /* 2078 * Poll write faults too if background snapshot is enabled; that's 2079 * when vCPUs get blocked by write-protected pages. 2080 */ 2081 block = poll_fault_page(rs, &offset); 2082 } 2083 2084 if (block) { 2085 /* 2086 * We want the background search to continue from the queued page 2087 * since the guest is likely to want other pages near to the page 2088 * it just requested. 2089 */ 2090 pss->block = block; 2091 pss->page = offset >> TARGET_PAGE_BITS; 2092 2093 /* 2094 * This unqueued page would break the "one round" check, even if 2095 * it's really rare. 2096 */ 2097 pss->complete_round = false; 2098 } 2099 2100 return !!block; 2101 } 2102 2103 /** 2104 * migration_page_queue_free: drop any remaining pages in the ram 2105 * request queue 2106 * 2107 * It should be empty at the end anyway, but in error cases there may 2108 * be some left; if any pages remain, we drop them. 2109 * 2110 */ 2111 static void migration_page_queue_free(RAMState *rs) 2112 { 2113 struct RAMSrcPageRequest *mspr, *next_mspr; 2114 /* This queue generally should be empty - but in the case of a failed 2115 * migration might have some droppings in. 2116 */ 2117 RCU_READ_LOCK_GUARD(); 2118 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 2119 memory_region_unref(mspr->rb->mr); 2120 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 2121 g_free(mspr); 2122 } 2123 } 2124 2125 /** 2126 * ram_save_queue_pages: queue the page for transmission 2127 * 2128 * A request from postcopy destination for example. 2129 * 2130 * Returns zero on success or negative on error 2131 * 2132 * @rbname: Name of the RAMBlock of the request. NULL means the 2133 * same as the last one. 2134 * @start: starting address from the start of the RAMBlock 2135 * @len: length (in bytes) to send 2136 */ 2137 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 2138 { 2139 RAMBlock *ramblock; 2140 RAMState *rs = ram_state; 2141 2142 ram_counters.postcopy_requests++; 2143 RCU_READ_LOCK_GUARD(); 2144 2145 if (!rbname) { 2146 /* Reuse last RAMBlock */ 2147 ramblock = rs->last_req_rb; 2148 2149 if (!ramblock) { 2150 /* 2151 * Shouldn't happen, we can't reuse the last RAMBlock if 2152 * it's the 1st request. 2153 */ 2154 error_report("ram_save_queue_pages no previous block"); 2155 return -1; 2156 } 2157 } else { 2158 ramblock = qemu_ram_block_by_name(rbname); 2159 2160 if (!ramblock) { 2161 /* We shouldn't be asked for a non-existent RAMBlock */ 2162 error_report("ram_save_queue_pages no block '%s'", rbname); 2163 return -1; 2164 } 2165 rs->last_req_rb = ramblock; 2166 } 2167 trace_ram_save_queue_pages(ramblock->idstr, start, len); 2168 if (!offset_in_ramblock(ramblock, start + len - 1)) { 2169 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 2170 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 2171 __func__, start, len, ramblock->used_length); 2172 return -1; 2173 } 2174 2175 /* 2176 * With postcopy preempt enabled, we send the page back directly in the 2177 * rp-return thread.
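* That is, instead of queueing the request below, the code that follows takes the dedicated RAM_CHANNEL_POSTCOPY PageSearchStatus and pushes the whole host page out on the preempt channel straight away.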
2178 */ 2179 if (postcopy_preempt_active()) { 2180 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 2181 size_t page_size = qemu_ram_pagesize(ramblock); 2182 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 2183 int ret = 0; 2184 2185 qemu_mutex_lock(&rs->bitmap_mutex); 2186 2187 pss_init(pss, ramblock, page_start); 2188 /* 2189 * Always use the preempt channel, and make sure it's there. It's 2190 * safe to access without lock, because while the rp-thread is running 2191 * we should be the only one operating on the qemufile. 2192 */ 2193 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 2194 assert(pss->pss_channel); 2195 2196 /* 2197 * It must be one host page or a multiple of the host page size. Just 2198 * assert; if something is wrong we're mostly split-brain anyway. 2199 */ 2200 assert(len % page_size == 0); 2201 while (len) { 2202 if (ram_save_host_page_urgent(pss)) { 2203 error_report("%s: ram_save_host_page_urgent() failed: " 2204 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 2205 __func__, ramblock->idstr, start); 2206 ret = -1; 2207 break; 2208 } 2209 /* 2210 * NOTE: after ram_save_host_page_urgent() succeeds, pss->page 2211 * will automatically be moved and point to the next host page 2212 * we're going to send, so no need to update it here. 2213 * 2214 * Normally QEMU never sends more than one host page per request, 2215 * so the loop should only run once; we keep it anyway for 2216 * consistency. 2217 */ 2218 len -= page_size; 2219 }; 2220 qemu_mutex_unlock(&rs->bitmap_mutex); 2221 2222 return ret; 2223 } 2224 2225 struct RAMSrcPageRequest *new_entry = 2226 g_new0(struct RAMSrcPageRequest, 1); 2227 new_entry->rb = ramblock; 2228 new_entry->offset = start; 2229 new_entry->len = len; 2230 2231 memory_region_ref(ramblock->mr); 2232 qemu_mutex_lock(&rs->src_page_req_mutex); 2233 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 2234 migration_make_urgent_request(); 2235 qemu_mutex_unlock(&rs->src_page_req_mutex); 2236 2237 return 0; 2238 } 2239 2240 static bool save_page_use_compression(RAMState *rs) 2241 { 2242 if (!migrate_use_compression()) { 2243 return false; 2244 } 2245 2246 /* 2247 * If xbzrle is enabled (e.g., after the first round of migration), stop 2248 * using the data compression. In theory, xbzrle can do better than 2249 * compression. 2250 */ 2251 if (rs->xbzrle_enabled) { 2252 return false; 2253 } 2254 2255 return true; 2256 } 2257 2258 /* 2259 * Try to compress the page before posting it out; return true if the page 2260 * has been properly handled by compression, otherwise other paths need 2261 * to handle it. 2262 */ 2263 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, 2264 RAMBlock *block, ram_addr_t offset) 2265 { 2266 if (!save_page_use_compression(rs)) { 2267 return false; 2268 } 2269 2270 /* 2271 * When starting a new block, the first page of the block should be 2272 * sent out before other pages in the same block, and all the pages 2273 * in the last block should have been sent out already. Keeping this 2274 * order is important, because the 'cont' flag 2275 * is used to avoid resending the block name. 2276 * 2277 * We post the first page as a normal page because compression takes 2278 * much CPU time.
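* (That first page therefore falls through to the normal path below, and flush_compressed_data() makes sure nothing queued for the previous block is still sitting in the compression threads.)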
2279 */ 2280 if (block != pss->last_sent_block) { 2281 flush_compressed_data(rs); 2282 return false; 2283 } 2284 2285 if (compress_page_with_multi_thread(block, offset) > 0) { 2286 return true; 2287 } 2288 2289 compression_counters.busy++; 2290 return false; 2291 } 2292 2293 /** 2294 * ram_save_target_page: save one target page 2295 * 2296 * Returns the number of pages written 2297 * 2298 * @rs: current RAM state 2299 * @pss: data about the page we want to send 2300 */ 2301 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 2302 { 2303 RAMBlock *block = pss->block; 2304 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2305 int res; 2306 2307 if (control_save_page(pss, block, offset, &res)) { 2308 return res; 2309 } 2310 2311 if (save_compress_page(rs, pss, block, offset)) { 2312 return 1; 2313 } 2314 2315 res = save_zero_page(pss, block, offset); 2316 if (res > 0) { 2317 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 2318 * page would be stale 2319 */ 2320 if (rs->xbzrle_enabled) { 2321 XBZRLE_cache_lock(); 2322 xbzrle_cache_zero_page(rs, block->offset + offset); 2323 XBZRLE_cache_unlock(); 2324 } 2325 return res; 2326 } 2327 2328 /* 2329 * Do not use multifd in postcopy as one whole host page should be 2330 * placed. Meanwhile postcopy requires atomic update of pages, so even 2331 * if host page size == guest page size the dest guest during run may 2332 * still see partially copied pages which is data corruption. 2333 */ 2334 if (migrate_use_multifd() && !migration_in_postcopy()) { 2335 return ram_save_multifd_page(pss->pss_channel, block, offset); 2336 } 2337 2338 return ram_save_page(rs, pss); 2339 } 2340 2341 /* Should be called before sending a host page */ 2342 static void pss_host_page_prepare(PageSearchStatus *pss) 2343 { 2344 /* How many guest pages are there in one host page? */ 2345 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2346 2347 pss->host_page_sending = true; 2348 if (guest_pfns <= 1) { 2349 /* 2350 * This covers both when guest psize == host psize, or when guest 2351 * has larger psize than the host (guest_pfns==0). 2352 * 2353 * For the latter, we always send one whole guest page per 2354 * iteration of the host page (example: an Alpha VM on x86 host 2355 * will have guest psize 8K while host psize 4K). 2356 */ 2357 pss->host_page_start = pss->page; 2358 pss->host_page_end = pss->page + 1; 2359 } else { 2360 /* 2361 * The host page spans over multiple guest pages, we send them 2362 * within the same host page iteration. 2363 */ 2364 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2365 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2366 } 2367 } 2368 2369 /* 2370 * Whether the page pointed by PSS is within the host page being sent. 2371 * Must be called after a previous pss_host_page_prepare(). 2372 */ 2373 static bool pss_within_range(PageSearchStatus *pss) 2374 { 2375 ram_addr_t ram_addr; 2376 2377 assert(pss->host_page_sending); 2378 2379 /* Over host-page boundary? */ 2380 if (pss->page >= pss->host_page_end) { 2381 return false; 2382 } 2383 2384 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2385 2386 return offset_in_ramblock(pss->block, ram_addr); 2387 } 2388 2389 static void pss_host_page_finish(PageSearchStatus *pss) 2390 { 2391 pss->host_page_sending = false; 2392 /* This is not needed, but just to reset it */ 2393 pss->host_page_start = pss->host_page_end = 0; 2394 } 2395 2396 /* 2397 * Send an urgent host page specified by `pss'. 
Need to be called with 2398 * bitmap_mutex held. 2399 * 2400 * Returns 0 if save host page succeeded, false otherwise. 2401 */ 2402 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2403 { 2404 bool page_dirty, sent = false; 2405 RAMState *rs = ram_state; 2406 int ret = 0; 2407 2408 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2409 pss_host_page_prepare(pss); 2410 2411 /* 2412 * If precopy is sending the same page, let it be done in precopy, or 2413 * we could send the same page in two channels and none of them will 2414 * receive the whole page. 2415 */ 2416 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2417 trace_postcopy_preempt_hit(pss->block->idstr, 2418 pss->page << TARGET_PAGE_BITS); 2419 return 0; 2420 } 2421 2422 do { 2423 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2424 2425 if (page_dirty) { 2426 /* Be strict to return code; it must be 1, or what else? */ 2427 if (ram_save_target_page(rs, pss) != 1) { 2428 error_report_once("%s: ram_save_target_page failed", __func__); 2429 ret = -1; 2430 goto out; 2431 } 2432 sent = true; 2433 } 2434 pss_find_next_dirty(pss); 2435 } while (pss_within_range(pss)); 2436 out: 2437 pss_host_page_finish(pss); 2438 /* For urgent requests, flush immediately if sent */ 2439 if (sent) { 2440 qemu_fflush(pss->pss_channel); 2441 } 2442 return ret; 2443 } 2444 2445 /** 2446 * ram_save_host_page: save a whole host page 2447 * 2448 * Starting at *offset send pages up to the end of the current host 2449 * page. It's valid for the initial offset to point into the middle of 2450 * a host page in which case the remainder of the hostpage is sent. 2451 * Only dirty target pages are sent. Note that the host page size may 2452 * be a huge page for this block. 2453 * 2454 * The saving stops at the boundary of the used_length of the block 2455 * if the RAMBlock isn't a multiple of the host page size. 2456 * 2457 * The caller must be with ram_state.bitmap_mutex held to call this 2458 * function. Note that this function can temporarily release the lock, but 2459 * when the function is returned it'll make sure the lock is still held. 2460 * 2461 * Returns the number of pages written or negative on error 2462 * 2463 * @rs: current RAM state 2464 * @pss: data about the page we want to send 2465 */ 2466 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2467 { 2468 bool page_dirty, preempt_active = postcopy_preempt_active(); 2469 int tmppages, pages = 0; 2470 size_t pagesize_bits = 2471 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2472 unsigned long start_page = pss->page; 2473 int res; 2474 2475 if (ramblock_is_ignored(pss->block)) { 2476 error_report("block %s should not be migrated !", pss->block->idstr); 2477 return 0; 2478 } 2479 2480 /* Update host page boundary information */ 2481 pss_host_page_prepare(pss); 2482 2483 do { 2484 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2485 2486 /* Check the pages is dirty and if it is send it */ 2487 if (page_dirty) { 2488 /* 2489 * Properly yield the lock only in postcopy preempt mode 2490 * because both migration thread and rp-return thread can 2491 * operate on the bitmaps. 2492 */ 2493 if (preempt_active) { 2494 qemu_mutex_unlock(&rs->bitmap_mutex); 2495 } 2496 tmppages = ram_save_target_page(rs, pss); 2497 if (tmppages >= 0) { 2498 pages += tmppages; 2499 /* 2500 * Allow rate limiting to happen in the middle of huge pages if 2501 * something is sent in the current iteration. 
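* For example, with a 2MiB hugepage-backed block and 4KiB target pages, pagesize_bits is 512, so this loop can run up to 512 times for one host page; calling migration_rate_limit() whenever a target page was actually sent keeps throttling responsive even mid-hugepage.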
2502 */ 2503 if (pagesize_bits > 1 && tmppages > 0) { 2504 migration_rate_limit(); 2505 } 2506 } 2507 if (preempt_active) { 2508 qemu_mutex_lock(&rs->bitmap_mutex); 2509 } 2510 } else { 2511 tmppages = 0; 2512 } 2513 2514 if (tmppages < 0) { 2515 pss_host_page_finish(pss); 2516 return tmppages; 2517 } 2518 2519 pss_find_next_dirty(pss); 2520 } while (pss_within_range(pss)); 2521 2522 pss_host_page_finish(pss); 2523 2524 res = ram_save_release_protection(rs, pss, start_page); 2525 return (res < 0 ? res : pages); 2526 } 2527 2528 /** 2529 * ram_find_and_save_block: finds a dirty page and sends it to f 2530 * 2531 * Called within an RCU critical section. 2532 * 2533 * Returns the number of pages written where zero means no dirty pages, 2534 * or negative on error 2535 * 2536 * @rs: current RAM state 2537 * 2538 * On systems where host-page-size > target-page-size it will send all the 2539 * pages in a host page that are dirty. 2540 */ 2541 static int ram_find_and_save_block(RAMState *rs) 2542 { 2543 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2544 int pages = 0; 2545 bool again, found; 2546 2547 /* No dirty page as there is zero RAM */ 2548 if (!ram_bytes_total()) { 2549 return pages; 2550 } 2551 2552 /* 2553 * Always keep last_seen_block/last_page valid during this procedure, 2554 * because find_dirty_block() relies on these values (e.g., we compare 2555 * last_seen_block with pss.block to see whether we searched all the 2556 * ramblocks) to detect the completion of migration. Having NULL value 2557 * of last_seen_block can conditionally cause below loop to run forever. 2558 */ 2559 if (!rs->last_seen_block) { 2560 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2561 rs->last_page = 0; 2562 } 2563 2564 pss_init(pss, rs->last_seen_block, rs->last_page); 2565 2566 do { 2567 again = true; 2568 found = get_queued_page(rs, pss); 2569 2570 if (!found) { 2571 /* priority queue empty, so just search for something dirty */ 2572 found = find_dirty_block(rs, pss, &again); 2573 } 2574 2575 if (found) { 2576 pages = ram_save_host_page(rs, pss); 2577 } 2578 } while (!pages && again); 2579 2580 rs->last_seen_block = pss->block; 2581 rs->last_page = pss->page; 2582 2583 return pages; 2584 } 2585 2586 void acct_update_position(QEMUFile *f, size_t size, bool zero) 2587 { 2588 uint64_t pages = size / TARGET_PAGE_SIZE; 2589 2590 if (zero) { 2591 stat64_add(&ram_atomic_counters.duplicate, pages); 2592 } else { 2593 stat64_add(&ram_atomic_counters.normal, pages); 2594 ram_transferred_add(size); 2595 qemu_file_credit_transfer(f, size); 2596 } 2597 } 2598 2599 static uint64_t ram_bytes_total_common(bool count_ignored) 2600 { 2601 RAMBlock *block; 2602 uint64_t total = 0; 2603 2604 RCU_READ_LOCK_GUARD(); 2605 2606 if (count_ignored) { 2607 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2608 total += block->used_length; 2609 } 2610 } else { 2611 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2612 total += block->used_length; 2613 } 2614 } 2615 return total; 2616 } 2617 2618 uint64_t ram_bytes_total(void) 2619 { 2620 return ram_bytes_total_common(false); 2621 } 2622 2623 static void xbzrle_load_setup(void) 2624 { 2625 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2626 } 2627 2628 static void xbzrle_load_cleanup(void) 2629 { 2630 g_free(XBZRLE.decoded_buf); 2631 XBZRLE.decoded_buf = NULL; 2632 } 2633 2634 static void ram_state_cleanup(RAMState **rsp) 2635 { 2636 if (*rsp) { 2637 migration_page_queue_free(*rsp); 2638 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2639 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 
2640 g_free(*rsp); 2641 *rsp = NULL; 2642 } 2643 } 2644 2645 static void xbzrle_cleanup(void) 2646 { 2647 XBZRLE_cache_lock(); 2648 if (XBZRLE.cache) { 2649 cache_fini(XBZRLE.cache); 2650 g_free(XBZRLE.encoded_buf); 2651 g_free(XBZRLE.current_buf); 2652 g_free(XBZRLE.zero_target_page); 2653 XBZRLE.cache = NULL; 2654 XBZRLE.encoded_buf = NULL; 2655 XBZRLE.current_buf = NULL; 2656 XBZRLE.zero_target_page = NULL; 2657 } 2658 XBZRLE_cache_unlock(); 2659 } 2660 2661 static void ram_save_cleanup(void *opaque) 2662 { 2663 RAMState **rsp = opaque; 2664 RAMBlock *block; 2665 2666 /* We don't use dirty log with background snapshots */ 2667 if (!migrate_background_snapshot()) { 2668 /* caller have hold iothread lock or is in a bh, so there is 2669 * no writing race against the migration bitmap 2670 */ 2671 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2672 /* 2673 * do not stop dirty log without starting it, since 2674 * memory_global_dirty_log_stop will assert that 2675 * memory_global_dirty_log_start/stop used in pairs 2676 */ 2677 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2678 } 2679 } 2680 2681 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2682 g_free(block->clear_bmap); 2683 block->clear_bmap = NULL; 2684 g_free(block->bmap); 2685 block->bmap = NULL; 2686 } 2687 2688 xbzrle_cleanup(); 2689 compress_threads_save_cleanup(); 2690 ram_state_cleanup(rsp); 2691 } 2692 2693 static void ram_state_reset(RAMState *rs) 2694 { 2695 int i; 2696 2697 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2698 rs->pss[i].last_sent_block = NULL; 2699 } 2700 2701 rs->last_seen_block = NULL; 2702 rs->last_page = 0; 2703 rs->last_version = ram_list.version; 2704 rs->xbzrle_enabled = false; 2705 } 2706 2707 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2708 2709 /* **** functions for postcopy ***** */ 2710 2711 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2712 { 2713 struct RAMBlock *block; 2714 2715 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2716 unsigned long *bitmap = block->bmap; 2717 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2718 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2719 2720 while (run_start < range) { 2721 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2722 ram_discard_range(block->idstr, 2723 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2724 ((ram_addr_t)(run_end - run_start)) 2725 << TARGET_PAGE_BITS); 2726 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2727 } 2728 } 2729 } 2730 2731 /** 2732 * postcopy_send_discard_bm_ram: discard a RAMBlock 2733 * 2734 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2735 * 2736 * @ms: current migration state 2737 * @block: RAMBlock to discard 2738 */ 2739 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2740 { 2741 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2742 unsigned long current; 2743 unsigned long *bitmap = block->bmap; 2744 2745 for (current = 0; current < end; ) { 2746 unsigned long one = find_next_bit(bitmap, end, current); 2747 unsigned long zero, discard_length; 2748 2749 if (one >= end) { 2750 break; 2751 } 2752 2753 zero = find_next_zero_bit(bitmap, end, one + 1); 2754 2755 if (zero >= end) { 2756 discard_length = end - one; 2757 } else { 2758 discard_length = zero - one; 2759 } 2760 postcopy_discard_send_range(ms, one, discard_length); 2761 current = one + discard_length; 2762 } 2763 } 2764 2765 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2766 2767 /** 2768 * 
postcopy_each_ram_send_discard: discard all RAMBlocks 2769 * 2770 * Utility for the outgoing postcopy code. 2771 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2772 * passing it bitmap indexes and name. 2773 * (qemu_ram_foreach_block ends up passing unscaled lengths 2774 * which would mean postcopy code would have to deal with target page) 2775 * 2776 * @ms: current migration state 2777 */ 2778 static void postcopy_each_ram_send_discard(MigrationState *ms) 2779 { 2780 struct RAMBlock *block; 2781 2782 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2783 postcopy_discard_send_init(ms, block->idstr); 2784 2785 /* 2786 * Deal with TPS != HPS and huge pages. It discard any partially sent 2787 * host-page size chunks, mark any partially dirty host-page size 2788 * chunks as all dirty. In this case the host-page is the host-page 2789 * for the particular RAMBlock, i.e. it might be a huge page. 2790 */ 2791 postcopy_chunk_hostpages_pass(ms, block); 2792 2793 /* 2794 * Postcopy sends chunks of bitmap over the wire, but it 2795 * just needs indexes at this point, avoids it having 2796 * target page specific code. 2797 */ 2798 postcopy_send_discard_bm_ram(ms, block); 2799 postcopy_discard_send_finish(ms); 2800 } 2801 } 2802 2803 /** 2804 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2805 * 2806 * Helper for postcopy_chunk_hostpages; it's called twice to 2807 * canonicalize the two bitmaps, that are similar, but one is 2808 * inverted. 2809 * 2810 * Postcopy requires that all target pages in a hostpage are dirty or 2811 * clean, not a mix. This function canonicalizes the bitmaps. 2812 * 2813 * @ms: current migration state 2814 * @block: block that contains the page we want to canonicalize 2815 */ 2816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2817 { 2818 RAMState *rs = ram_state; 2819 unsigned long *bitmap = block->bmap; 2820 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2821 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2822 unsigned long run_start; 2823 2824 if (block->page_size == TARGET_PAGE_SIZE) { 2825 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2826 return; 2827 } 2828 2829 /* Find a dirty page */ 2830 run_start = find_next_bit(bitmap, pages, 0); 2831 2832 while (run_start < pages) { 2833 2834 /* 2835 * If the start of this run of pages is in the middle of a host 2836 * page, then we need to fixup this host page. 2837 */ 2838 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2839 /* Find the end of this run */ 2840 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2841 /* 2842 * If the end isn't at the start of a host page, then the 2843 * run doesn't finish at the end of a host page 2844 * and we need to discard. 2845 */ 2846 } 2847 2848 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2849 unsigned long page; 2850 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2851 host_ratio); 2852 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2853 2854 /* Clean up the bitmap */ 2855 for (page = fixup_start_addr; 2856 page < fixup_start_addr + host_ratio; page++) { 2857 /* 2858 * Remark them as dirty, updating the count for any pages 2859 * that weren't previously dirty. 
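* For example, with a 2MiB hugepage block and 4KiB target pages (host_ratio = 512), a dirty run starting at target page 1000 gets widened so that pages 512..1023 are all marked dirty, and the whole host page is resent as one unit.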
2860 */ 2861 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2862 } 2863 } 2864 2865 /* Find the next dirty page for the next iteration */ 2866 run_start = find_next_bit(bitmap, pages, run_start); 2867 } 2868 } 2869 2870 /** 2871 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2872 * 2873 * Transmit the set of pages to be discarded after precopy to the target 2874 * these are pages that: 2875 * a) Have been previously transmitted but are now dirty again 2876 * b) Pages that have never been transmitted, this ensures that 2877 * any pages on the destination that have been mapped by background 2878 * tasks get discarded (transparent huge pages is the specific concern) 2879 * Hopefully this is pretty sparse 2880 * 2881 * @ms: current migration state 2882 */ 2883 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2884 { 2885 RAMState *rs = ram_state; 2886 2887 RCU_READ_LOCK_GUARD(); 2888 2889 /* This should be our last sync, the src is now paused */ 2890 migration_bitmap_sync(rs); 2891 2892 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2893 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2894 rs->last_seen_block = NULL; 2895 rs->last_page = 0; 2896 2897 postcopy_each_ram_send_discard(ms); 2898 2899 trace_ram_postcopy_send_discard_bitmap(); 2900 } 2901 2902 /** 2903 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2904 * 2905 * Returns zero on success 2906 * 2907 * @rbname: name of the RAMBlock of the request. NULL means the 2908 * same that last one. 2909 * @start: RAMBlock starting page 2910 * @length: RAMBlock size 2911 */ 2912 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2913 { 2914 trace_ram_discard_range(rbname, start, length); 2915 2916 RCU_READ_LOCK_GUARD(); 2917 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2918 2919 if (!rb) { 2920 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2921 return -1; 2922 } 2923 2924 /* 2925 * On source VM, we don't need to update the received bitmap since 2926 * we don't even have one. 2927 */ 2928 if (rb->receivedmap) { 2929 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2930 length >> qemu_target_page_bits()); 2931 } 2932 2933 return ram_block_discard_range(rb, start, length); 2934 } 2935 2936 /* 2937 * For every allocation, we will try not to crash the VM if the 2938 * allocation failed. 
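* (g_try_malloc0()/g_try_malloc() and cache_init() return NULL on failure instead of aborting, which is what lets the error paths below unwind and return -ENOMEM.)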
2939 */ 2940 static int xbzrle_init(void) 2941 { 2942 Error *local_err = NULL; 2943 2944 if (!migrate_use_xbzrle()) { 2945 return 0; 2946 } 2947 2948 XBZRLE_cache_lock(); 2949 2950 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2951 if (!XBZRLE.zero_target_page) { 2952 error_report("%s: Error allocating zero page", __func__); 2953 goto err_out; 2954 } 2955 2956 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2957 TARGET_PAGE_SIZE, &local_err); 2958 if (!XBZRLE.cache) { 2959 error_report_err(local_err); 2960 goto free_zero_page; 2961 } 2962 2963 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2964 if (!XBZRLE.encoded_buf) { 2965 error_report("%s: Error allocating encoded_buf", __func__); 2966 goto free_cache; 2967 } 2968 2969 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2970 if (!XBZRLE.current_buf) { 2971 error_report("%s: Error allocating current_buf", __func__); 2972 goto free_encoded_buf; 2973 } 2974 2975 /* We are all good */ 2976 XBZRLE_cache_unlock(); 2977 return 0; 2978 2979 free_encoded_buf: 2980 g_free(XBZRLE.encoded_buf); 2981 XBZRLE.encoded_buf = NULL; 2982 free_cache: 2983 cache_fini(XBZRLE.cache); 2984 XBZRLE.cache = NULL; 2985 free_zero_page: 2986 g_free(XBZRLE.zero_target_page); 2987 XBZRLE.zero_target_page = NULL; 2988 err_out: 2989 XBZRLE_cache_unlock(); 2990 return -ENOMEM; 2991 } 2992 2993 static int ram_state_init(RAMState **rsp) 2994 { 2995 *rsp = g_try_new0(RAMState, 1); 2996 2997 if (!*rsp) { 2998 error_report("%s: Init ramstate fail", __func__); 2999 return -1; 3000 } 3001 3002 qemu_mutex_init(&(*rsp)->bitmap_mutex); 3003 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 3004 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 3005 3006 /* 3007 * Count the total number of pages used by ram blocks not including any 3008 * gaps due to alignment or unplugs. 3009 * This must match with the initial values of dirty bitmap. 3010 */ 3011 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 3012 ram_state_reset(*rsp); 3013 3014 return 0; 3015 } 3016 3017 static void ram_list_init_bitmaps(void) 3018 { 3019 MigrationState *ms = migrate_get_current(); 3020 RAMBlock *block; 3021 unsigned long pages; 3022 uint8_t shift; 3023 3024 /* Skip setting bitmap if there is no RAM */ 3025 if (ram_bytes_total()) { 3026 shift = ms->clear_bitmap_shift; 3027 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 3028 error_report("clear_bitmap_shift (%u) too big, using " 3029 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 3030 shift = CLEAR_BITMAP_SHIFT_MAX; 3031 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 3032 error_report("clear_bitmap_shift (%u) too small, using " 3033 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 3034 shift = CLEAR_BITMAP_SHIFT_MIN; 3035 } 3036 3037 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3038 pages = block->max_length >> TARGET_PAGE_BITS; 3039 /* 3040 * The initial dirty bitmap for migration must be set with all 3041 * ones to make sure we'll migrate every guest RAM page to 3042 * destination. 3043 * Here we set RAMBlock.bmap all to 1 because when rebegin a 3044 * new migration after a failed migration, ram_list. 3045 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 3046 * guest memory. 
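* (bmap has one bit per target page, while each clear_bmap bit covers 2^clear_bmap_shift target pages; with the default shift of 18 and 4KiB target pages a single clear_bmap bit tracks 1GiB of guest RAM.)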
3047 */ 3048 block->bmap = bitmap_new(pages); 3049 bitmap_set(block->bmap, 0, pages); 3050 block->clear_bmap_shift = shift; 3051 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 3052 } 3053 } 3054 } 3055 3056 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 3057 { 3058 unsigned long pages; 3059 RAMBlock *rb; 3060 3061 RCU_READ_LOCK_GUARD(); 3062 3063 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3064 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 3065 rs->migration_dirty_pages -= pages; 3066 } 3067 } 3068 3069 static void ram_init_bitmaps(RAMState *rs) 3070 { 3071 /* For memory_global_dirty_log_start below. */ 3072 qemu_mutex_lock_iothread(); 3073 qemu_mutex_lock_ramlist(); 3074 3075 WITH_RCU_READ_LOCK_GUARD() { 3076 ram_list_init_bitmaps(); 3077 /* We don't use dirty log with background snapshots */ 3078 if (!migrate_background_snapshot()) { 3079 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); 3080 migration_bitmap_sync_precopy(rs); 3081 } 3082 } 3083 qemu_mutex_unlock_ramlist(); 3084 qemu_mutex_unlock_iothread(); 3085 3086 /* 3087 * After an eventual first bitmap sync, fixup the initial bitmap 3088 * containing all 1s to exclude any discarded pages from migration. 3089 */ 3090 migration_bitmap_clear_discarded_pages(rs); 3091 } 3092 3093 static int ram_init_all(RAMState **rsp) 3094 { 3095 if (ram_state_init(rsp)) { 3096 return -1; 3097 } 3098 3099 if (xbzrle_init()) { 3100 ram_state_cleanup(rsp); 3101 return -1; 3102 } 3103 3104 ram_init_bitmaps(*rsp); 3105 3106 return 0; 3107 } 3108 3109 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 3110 { 3111 RAMBlock *block; 3112 uint64_t pages = 0; 3113 3114 /* 3115 * Postcopy is not using xbzrle/compression, so no need for that. 3116 * Also, since source are already halted, we don't need to care 3117 * about dirty page logging as well. 3118 */ 3119 3120 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3121 pages += bitmap_count_one(block->bmap, 3122 block->used_length >> TARGET_PAGE_BITS); 3123 } 3124 3125 /* This may not be aligned with current bitmaps. Recalculate. */ 3126 rs->migration_dirty_pages = pages; 3127 3128 ram_state_reset(rs); 3129 3130 /* Update RAMState cache of output QEMUFile */ 3131 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 3132 3133 trace_ram_state_resume_prepare(pages); 3134 } 3135 3136 /* 3137 * This function clears bits of the free pages reported by the caller from the 3138 * migration dirty bitmap. @addr is the host address corresponding to the 3139 * start of the continuous guest free pages, and @len is the total bytes of 3140 * those pages. 3141 */ 3142 void qemu_guest_free_page_hint(void *addr, size_t len) 3143 { 3144 RAMBlock *block; 3145 ram_addr_t offset; 3146 size_t used_len, start, npages; 3147 MigrationState *s = migrate_get_current(); 3148 3149 /* This function is currently expected to be used during live migration */ 3150 if (!migration_is_setup_or_active(s->state)) { 3151 return; 3152 } 3153 3154 for (; len > 0; len -= used_len, addr += used_len) { 3155 block = qemu_ram_block_from_host(addr, false, &offset); 3156 if (unlikely(!block || offset >= block->used_length)) { 3157 /* 3158 * The implementation might not support RAMBlock resize during 3159 * live migration, but it could happen in theory with future 3160 * updates. So we add a check here to capture that case. 
3161 */ 3162 error_report_once("%s unexpected error", __func__); 3163 return; 3164 } 3165 3166 if (len <= block->used_length - offset) { 3167 used_len = len; 3168 } else { 3169 used_len = block->used_length - offset; 3170 } 3171 3172 start = offset >> TARGET_PAGE_BITS; 3173 npages = used_len >> TARGET_PAGE_BITS; 3174 3175 qemu_mutex_lock(&ram_state->bitmap_mutex); 3176 /* 3177 * The skipped free pages are equivalent to having been sent from 3178 * clear_bmap's perspective, so clear the bits from the memory region 3179 * bitmap which are initially set. Otherwise those skipped pages will 3180 * be sent in the next round after syncing from the memory region bitmap. 3181 */ 3182 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 3183 ram_state->migration_dirty_pages -= 3184 bitmap_count_one_with_offset(block->bmap, start, npages); 3185 bitmap_clear(block->bmap, start, npages); 3186 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3187 } 3188 } 3189 3190 /* 3191 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a 3192 * long-running RCU critical section. When RCU reclaims in the code 3193 * start to become numerous it will be necessary to reduce the 3194 * granularity of these critical sections. 3195 */ 3196 3197 /** 3198 * ram_save_setup: Setup RAM for migration 3199 * 3200 * Returns zero to indicate success and negative for error 3201 * 3202 * @f: QEMUFile where to send the data 3203 * @opaque: RAMState pointer 3204 */ 3205 static int ram_save_setup(QEMUFile *f, void *opaque) 3206 { 3207 RAMState **rsp = opaque; 3208 RAMBlock *block; 3209 int ret; 3210 3211 if (compress_threads_save_setup()) { 3212 return -1; 3213 } 3214 3215 /* migration has already set up the bitmap, reuse it. */ 3216 if (!migration_in_colo_state()) { 3217 if (ram_init_all(rsp) != 0) { 3218 compress_threads_save_cleanup(); 3219 return -1; 3220 } 3221 } 3222 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3223 3224 WITH_RCU_READ_LOCK_GUARD() { 3225 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); 3226 3227 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3228 qemu_put_byte(f, strlen(block->idstr)); 3229 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3230 qemu_put_be64(f, block->used_length); 3231 if (migrate_postcopy_ram() && block->page_size != 3232 qemu_host_page_size) { 3233 qemu_put_be64(f, block->page_size); 3234 } 3235 if (migrate_ignore_shared()) { 3236 qemu_put_be64(f, block->mr->addr); 3237 } 3238 } 3239 } 3240 3241 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 3242 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 3243 3244 ret = multifd_send_sync_main(f); 3245 if (ret < 0) { 3246 return ret; 3247 } 3248 3249 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3250 qemu_fflush(f); 3251 3252 return 0; 3253 } 3254 3255 /** 3256 * ram_save_iterate: iterative stage for migration 3257 * 3258 * Returns zero to indicate success and negative for error 3259 * 3260 * @f: QEMUFile where to send the data 3261 * @opaque: RAMState pointer 3262 */ 3263 static int ram_save_iterate(QEMUFile *f, void *opaque) 3264 { 3265 RAMState **temp = opaque; 3266 RAMState *rs = *temp; 3267 int ret = 0; 3268 int i; 3269 int64_t t0; 3270 int done = 0; 3271 3272 if (blk_mig_bulk_active()) { 3273 /* Avoid transferring RAM during the bulk phase of block migration as 3274 * the bulk phase will usually take a long time and transferring 3275 * RAM updates during that time is pointless. */ 3276 goto out; 3277 } 3278 3279 /* 3280 * We'll take this lock a little bit long, but it's okay for two reasons.
3281 * Firstly, the only other thread that can take it is the one calling 3282 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3283 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3284 * guarantees that we release it on a regular basis. 3285 */ 3286 qemu_mutex_lock(&rs->bitmap_mutex); 3287 WITH_RCU_READ_LOCK_GUARD() { 3288 if (ram_list.version != rs->last_version) { 3289 ram_state_reset(rs); 3290 } 3291 3292 /* Read version before ram_list.blocks */ 3293 smp_rmb(); 3294 3295 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 3296 3297 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3298 i = 0; 3299 while ((ret = qemu_file_rate_limit(f)) == 0 || 3300 postcopy_has_request(rs)) { 3301 int pages; 3302 3303 if (qemu_file_get_error(f)) { 3304 break; 3305 } 3306 3307 pages = ram_find_and_save_block(rs); 3308 /* no more pages to send */ 3309 if (pages == 0) { 3310 done = 1; 3311 break; 3312 } 3313 3314 if (pages < 0) { 3315 qemu_file_set_error(f, pages); 3316 break; 3317 } 3318 3319 rs->target_page_count += pages; 3320 3321 /* 3322 * During postcopy, it is necessary to make sure one whole host 3323 * page is sent in one chunk. 3324 */ 3325 if (migrate_postcopy_ram()) { 3326 flush_compressed_data(rs); 3327 } 3328 3329 /* 3330 * We want to check in the 1st loop, just in case it was the 1st 3331 * time and we had to sync the dirty bitmap. 3332 * qemu_clock_get_ns() is a bit expensive, so we only check every 3333 * few iterations. 3334 */ 3335 if ((i & 63) == 0) { 3336 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3337 1000000; 3338 if (t1 > MAX_WAIT) { 3339 trace_ram_save_iterate_big_wait(t1, i); 3340 break; 3341 } 3342 } 3343 i++; 3344 } 3345 } 3346 qemu_mutex_unlock(&rs->bitmap_mutex); 3347 3348 /* 3349 * Must occur before EOS (or any QEMUFile operation) 3350 * because of RDMA protocol.
3351 */ 3352 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 3353 3354 out: 3355 if (ret >= 0 3356 && migration_is_setup_or_active(migrate_get_current()->state)) { 3357 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3358 if (ret < 0) { 3359 return ret; 3360 } 3361 3362 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3363 qemu_fflush(f); 3364 ram_transferred_add(8); 3365 3366 ret = qemu_file_get_error(f); 3367 } 3368 if (ret < 0) { 3369 return ret; 3370 } 3371 3372 return done; 3373 } 3374 3375 /** 3376 * ram_save_complete: function called to send the remaining amount of ram 3377 * 3378 * Returns zero to indicate success or negative on error 3379 * 3380 * Called with iothread lock 3381 * 3382 * @f: QEMUFile where to send the data 3383 * @opaque: RAMState pointer 3384 */ 3385 static int ram_save_complete(QEMUFile *f, void *opaque) 3386 { 3387 RAMState **temp = opaque; 3388 RAMState *rs = *temp; 3389 int ret = 0; 3390 3391 rs->last_stage = !migration_in_colo_state(); 3392 3393 WITH_RCU_READ_LOCK_GUARD() { 3394 if (!migration_in_postcopy()) { 3395 migration_bitmap_sync_precopy(rs); 3396 } 3397 3398 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 3399 3400 /* try transferring iterative blocks of memory */ 3401 3402 /* flush all remaining blocks regardless of rate limiting */ 3403 qemu_mutex_lock(&rs->bitmap_mutex); 3404 while (true) { 3405 int pages; 3406 3407 pages = ram_find_and_save_block(rs); 3408 /* no more blocks to sent */ 3409 if (pages == 0) { 3410 break; 3411 } 3412 if (pages < 0) { 3413 ret = pages; 3414 break; 3415 } 3416 } 3417 qemu_mutex_unlock(&rs->bitmap_mutex); 3418 3419 flush_compressed_data(rs); 3420 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 3421 } 3422 3423 if (ret < 0) { 3424 return ret; 3425 } 3426 3427 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); 3428 if (ret < 0) { 3429 return ret; 3430 } 3431 3432 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3433 qemu_fflush(f); 3434 3435 return 0; 3436 } 3437 3438 static void ram_state_pending_estimate(void *opaque, 3439 uint64_t *res_precopy_only, 3440 uint64_t *res_compatible, 3441 uint64_t *res_postcopy_only) 3442 { 3443 RAMState **temp = opaque; 3444 RAMState *rs = *temp; 3445 3446 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3447 3448 if (migrate_postcopy_ram()) { 3449 /* We can do postcopy, and all the data is postcopiable */ 3450 *res_postcopy_only += remaining_size; 3451 } else { 3452 *res_precopy_only += remaining_size; 3453 } 3454 } 3455 3456 static void ram_state_pending_exact(void *opaque, 3457 uint64_t *res_precopy_only, 3458 uint64_t *res_compatible, 3459 uint64_t *res_postcopy_only) 3460 { 3461 RAMState **temp = opaque; 3462 RAMState *rs = *temp; 3463 3464 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3465 3466 if (!migration_in_postcopy()) { 3467 qemu_mutex_lock_iothread(); 3468 WITH_RCU_READ_LOCK_GUARD() { 3469 migration_bitmap_sync_precopy(rs); 3470 } 3471 qemu_mutex_unlock_iothread(); 3472 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3473 } 3474 3475 if (migrate_postcopy_ram()) { 3476 /* We can do postcopy, and all the data is postcopiable */ 3477 *res_compatible += remaining_size; 3478 } else { 3479 *res_precopy_only += remaining_size; 3480 } 3481 } 3482 3483 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3484 { 3485 unsigned int xh_len; 3486 int xh_flags; 3487 uint8_t *loaded_data; 3488 3489 /* extract RLE header */ 3490 xh_flags = qemu_get_byte(f); 3491 xh_len = qemu_get_be16(f); 
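/* Wire layout of an XBZRLE page as parsed here: one flags byte (must be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length, then xh_len bytes of delta data that is decoded on top of the current page contents below. */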
3492 3493 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3494 error_report("Failed to load XBZRLE page - wrong compression!"); 3495 return -1; 3496 } 3497 3498 if (xh_len > TARGET_PAGE_SIZE) { 3499 error_report("Failed to load XBZRLE page - len overflow!"); 3500 return -1; 3501 } 3502 loaded_data = XBZRLE.decoded_buf; 3503 /* load data and decode */ 3504 /* it can change loaded_data to point to an internal buffer */ 3505 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3506 3507 /* decode RLE */ 3508 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3509 TARGET_PAGE_SIZE) == -1) { 3510 error_report("Failed to load XBZRLE page - decode error!"); 3511 return -1; 3512 } 3513 3514 return 0; 3515 } 3516 3517 /** 3518 * ram_block_from_stream: read a RAMBlock id from the migration stream 3519 * 3520 * Must be called from within a rcu critical section. 3521 * 3522 * Returns a pointer from within the RCU-protected ram_list. 3523 * 3524 * @mis: the migration incoming state pointer 3525 * @f: QEMUFile where to read the data from 3526 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3527 * @channel: the channel we're using 3528 */ 3529 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3530 QEMUFile *f, int flags, 3531 int channel) 3532 { 3533 RAMBlock *block = mis->last_recv_block[channel]; 3534 char id[256]; 3535 uint8_t len; 3536 3537 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3538 if (!block) { 3539 error_report("Ack, bad migration stream!"); 3540 return NULL; 3541 } 3542 return block; 3543 } 3544 3545 len = qemu_get_byte(f); 3546 qemu_get_buffer(f, (uint8_t *)id, len); 3547 id[len] = 0; 3548 3549 block = qemu_ram_block_by_name(id); 3550 if (!block) { 3551 error_report("Can't find block %s", id); 3552 return NULL; 3553 } 3554 3555 if (ramblock_is_ignored(block)) { 3556 error_report("block %s should not be migrated !", id); 3557 return NULL; 3558 } 3559 3560 mis->last_recv_block[channel] = block; 3561 3562 return block; 3563 } 3564 3565 static inline void *host_from_ram_block_offset(RAMBlock *block, 3566 ram_addr_t offset) 3567 { 3568 if (!offset_in_ramblock(block, offset)) { 3569 return NULL; 3570 } 3571 3572 return block->host + offset; 3573 } 3574 3575 static void *host_page_from_ram_block_offset(RAMBlock *block, 3576 ram_addr_t offset) 3577 { 3578 /* Note: Explicitly no check against offset_in_ramblock(). */ 3579 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3580 block->page_size); 3581 } 3582 3583 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3584 ram_addr_t offset) 3585 { 3586 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3587 } 3588 3589 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3590 ram_addr_t offset, bool record_bitmap) 3591 { 3592 if (!offset_in_ramblock(block, offset)) { 3593 return NULL; 3594 } 3595 if (!block->colo_cache) { 3596 error_report("%s: colo_cache is NULL in block :%s", 3597 __func__, block->idstr); 3598 return NULL; 3599 } 3600 3601 /* 3602 * During colo checkpoint, we need bitmap of these migrated pages. 3603 * It help us to decide which pages in ram cache should be flushed 3604 * into VM's RAM later. 
3605 */ 3606 if (record_bitmap && 3607 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { 3608 ram_state->migration_dirty_pages++; 3609 } 3610 return block->colo_cache + offset; 3611 } 3612 3613 /** 3614 * ram_handle_compressed: handle the zero page case 3615 * 3616 * If a page (or a whole RDMA chunk) has been 3617 * determined to be zero, then zap it. 3618 * 3619 * @host: host address for the zero page 3620 * @ch: what the page is filled from. We only support zero 3621 * @size: size of the zero page 3622 */ 3623 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 3624 { 3625 if (ch != 0 || !buffer_is_zero(host, size)) { 3626 memset(host, ch, size); 3627 } 3628 } 3629 3630 /* return the size after decompression, or negative value on error */ 3631 static int 3632 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 3633 const uint8_t *source, size_t source_len) 3634 { 3635 int err; 3636 3637 err = inflateReset(stream); 3638 if (err != Z_OK) { 3639 return -1; 3640 } 3641 3642 stream->avail_in = source_len; 3643 stream->next_in = (uint8_t *)source; 3644 stream->avail_out = dest_len; 3645 stream->next_out = dest; 3646 3647 err = inflate(stream, Z_NO_FLUSH); 3648 if (err != Z_STREAM_END) { 3649 return -1; 3650 } 3651 3652 return stream->total_out; 3653 } 3654 3655 static void *do_data_decompress(void *opaque) 3656 { 3657 DecompressParam *param = opaque; 3658 unsigned long pagesize; 3659 uint8_t *des; 3660 int len, ret; 3661 3662 qemu_mutex_lock(&param->mutex); 3663 while (!param->quit) { 3664 if (param->des) { 3665 des = param->des; 3666 len = param->len; 3667 param->des = 0; 3668 qemu_mutex_unlock(&param->mutex); 3669 3670 pagesize = TARGET_PAGE_SIZE; 3671 3672 ret = qemu_uncompress_data(&param->stream, des, pagesize, 3673 param->compbuf, len); 3674 if (ret < 0 && migrate_get_current()->decompress_error_check) { 3675 error_report("decompress data failed"); 3676 qemu_file_set_error(decomp_file, ret); 3677 } 3678 3679 qemu_mutex_lock(&decomp_done_lock); 3680 param->done = true; 3681 qemu_cond_signal(&decomp_done_cond); 3682 qemu_mutex_unlock(&decomp_done_lock); 3683 3684 qemu_mutex_lock(&param->mutex); 3685 } else { 3686 qemu_cond_wait(&param->cond, &param->mutex); 3687 } 3688 } 3689 qemu_mutex_unlock(&param->mutex); 3690 3691 return NULL; 3692 } 3693 3694 static int wait_for_decompress_done(void) 3695 { 3696 int idx, thread_count; 3697 3698 if (!migrate_use_compression()) { 3699 return 0; 3700 } 3701 3702 thread_count = migrate_decompress_threads(); 3703 qemu_mutex_lock(&decomp_done_lock); 3704 for (idx = 0; idx < thread_count; idx++) { 3705 while (!decomp_param[idx].done) { 3706 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3707 } 3708 } 3709 qemu_mutex_unlock(&decomp_done_lock); 3710 return qemu_file_get_error(decomp_file); 3711 } 3712 3713 static void compress_threads_load_cleanup(void) 3714 { 3715 int i, thread_count; 3716 3717 if (!migrate_use_compression()) { 3718 return; 3719 } 3720 thread_count = migrate_decompress_threads(); 3721 for (i = 0; i < thread_count; i++) { 3722 /* 3723 * we use it as an indicator of whether the thread was 3724 * properly initialized or not 3725 */ 3726 if (!decomp_param[i].compbuf) { 3727 break; 3728 } 3729 3730 qemu_mutex_lock(&decomp_param[i].mutex); 3731 decomp_param[i].quit = true; 3732 qemu_cond_signal(&decomp_param[i].cond); 3733 qemu_mutex_unlock(&decomp_param[i].mutex); 3734 } 3735 for (i = 0; i < thread_count; i++) { 3736 if (!decomp_param[i].compbuf) { 3737 break; 3738 } 3739 3740 qemu_thread_join(decompress_threads + i); 3741
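/* The thread has observed 'quit' and exited, so its mutex, cond, zlib stream and buffer can now be torn down safely. */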
qemu_mutex_destroy(&decomp_param[i].mutex); 3742 qemu_cond_destroy(&decomp_param[i].cond); 3743 inflateEnd(&decomp_param[i].stream); 3744 g_free(decomp_param[i].compbuf); 3745 decomp_param[i].compbuf = NULL; 3746 } 3747 g_free(decompress_threads); 3748 g_free(decomp_param); 3749 decompress_threads = NULL; 3750 decomp_param = NULL; 3751 decomp_file = NULL; 3752 } 3753 3754 static int compress_threads_load_setup(QEMUFile *f) 3755 { 3756 int i, thread_count; 3757 3758 if (!migrate_use_compression()) { 3759 return 0; 3760 } 3761 3762 thread_count = migrate_decompress_threads(); 3763 decompress_threads = g_new0(QemuThread, thread_count); 3764 decomp_param = g_new0(DecompressParam, thread_count); 3765 qemu_mutex_init(&decomp_done_lock); 3766 qemu_cond_init(&decomp_done_cond); 3767 decomp_file = f; 3768 for (i = 0; i < thread_count; i++) { 3769 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 3770 goto exit; 3771 } 3772 3773 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 3774 qemu_mutex_init(&decomp_param[i].mutex); 3775 qemu_cond_init(&decomp_param[i].cond); 3776 decomp_param[i].done = true; 3777 decomp_param[i].quit = false; 3778 qemu_thread_create(decompress_threads + i, "decompress", 3779 do_data_decompress, decomp_param + i, 3780 QEMU_THREAD_JOINABLE); 3781 } 3782 return 0; 3783 exit: 3784 compress_threads_load_cleanup(); 3785 return -1; 3786 } 3787 3788 static void decompress_data_with_multi_threads(QEMUFile *f, 3789 void *host, int len) 3790 { 3791 int idx, thread_count; 3792 3793 thread_count = migrate_decompress_threads(); 3794 QEMU_LOCK_GUARD(&decomp_done_lock); 3795 while (true) { 3796 for (idx = 0; idx < thread_count; idx++) { 3797 if (decomp_param[idx].done) { 3798 decomp_param[idx].done = false; 3799 qemu_mutex_lock(&decomp_param[idx].mutex); 3800 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3801 decomp_param[idx].des = host; 3802 decomp_param[idx].len = len; 3803 qemu_cond_signal(&decomp_param[idx].cond); 3804 qemu_mutex_unlock(&decomp_param[idx].mutex); 3805 break; 3806 } 3807 } 3808 if (idx < thread_count) { 3809 break; 3810 } else { 3811 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3812 } 3813 } 3814 } 3815 3816 static void colo_init_ram_state(void) 3817 { 3818 ram_state_init(&ram_state); 3819 } 3820 3821 /* 3822 * colo cache: this is for secondary VM, we cache the whole 3823 * memory of the secondary VM, it is need to hold the global lock 3824 * to call this helper. 3825 */ 3826 int colo_init_ram_cache(void) 3827 { 3828 RAMBlock *block; 3829 3830 WITH_RCU_READ_LOCK_GUARD() { 3831 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3832 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3833 NULL, false, false); 3834 if (!block->colo_cache) { 3835 error_report("%s: Can't alloc memory for COLO cache of block %s," 3836 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3837 block->used_length); 3838 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3839 if (block->colo_cache) { 3840 qemu_anon_ram_free(block->colo_cache, block->used_length); 3841 block->colo_cache = NULL; 3842 } 3843 } 3844 return -errno; 3845 } 3846 if (!machine_dump_guest_core(current_machine)) { 3847 qemu_madvise(block->colo_cache, block->used_length, 3848 QEMU_MADV_DONTDUMP); 3849 } 3850 } 3851 } 3852 3853 /* 3854 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3855 * with to decide which page in cache should be flushed into SVM's RAM. Here 3856 * we use the same name 'ram_bitmap' as for migration. 

static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}

/*
 * colo cache: this is for the secondary VM, we cache the whole
 * memory of the secondary VM.  The global lock must be held when
 * calling this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s, "
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that are sent by PVM; we use this dirty bitmap
     * to decide which pages in the cache should be flushed into SVM's RAM.
     * Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock must be held when calling this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}
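
/*
 * Taken together, the COLO helpers around here form the RAM cache
 * lifecycle on the secondary VM:
 *
 *   colo_init_ram_cache()           - allocate colo_cache and a dirty
 *                                     bitmap for every RAMBlock
 *   colo_incoming_start_dirty_log() - start dirty logging for the cache
 *   colo_flush_ram_cache()          - copy dirtied cache pages into SVM's
 *                                     RAM (defined further below)
 *   colo_release_ram_cache()        - stop logging and free the cache
 */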

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If there is a QEMU file error, stop right here; "addr" may be
         * invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses possibly smaller target pages;
             * however, the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TPs are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * We can skip setting page_buffer when this is a zero page
             * and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}
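
/*
 * Worked example for the host-page assembly above, assuming 4 KiB target
 * pages: for a hugetlbfs-backed RAMBlock with a 2 MiB page_size, one host
 * page is 2 MiB / 4 KiB = 512 target pages, so place_needed only becomes
 * true once target_pages reaches 512 and the temporary huge page buffer is
 * complete.  For a RAMBlock whose page_size equals TARGET_PAGE_SIZE, every
 * received target page is placed immediately.
 */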

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the content of the RAM cache into SVM's memory.
 * Only flush the pages that have been dirtied by PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}
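
/*
 * Note on the flush loop above: colo_bitmap_find_dirty() is expected to
 * return the start of a run of 'num' contiguous dirty pages within the
 * current block, so the cache is flushed one run at a time - the dirty
 * bits for the run are cleared and the whole run is copied with a single
 * memcpy() of num * TARGET_PAGE_SIZE bytes.
 */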

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE comes earlier; it shows that the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it once every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly; we put it into colo_cache first.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparing
             * stage while the VM was stopped, which is a time-consuming
             * process.  Here we optimize it with a trick: back up every
             * page during the migration process while COLO is enabled.
             * Although this slows the migration down a bit, it clearly
             * reduces the downtime of backing up all of SVM's memory in
             * the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * Put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 " != %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
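
/*
 * The two helpers above pair up during postcopy recovery:
 * ram_dirty_bitmap_sync_all() runs on the outgoing side, asks the
 * destination for the received bitmap of every RAMBlock
 * (qemu_savevm_send_recv_bitmap()) and then waits on rp_state.rp_sem once
 * per block, while the return-path thread applies each bitmap that comes
 * back via ram_dirty_bitmap_reload() below and wakes the waiter through
 * ram_dirty_bitmap_reload_notify().
 */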

/*
 * Read the received bitmap and invert it to use as the initial dirty
 * bitmap.  This is only used when a paused postcopy migration is resumed
 * from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion.  We are in postcopy (though paused).
     * The dirty bitmap won't change.  We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap".  Invert it to use as the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for this ramblock.  If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
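
/*
 * Worked example for the size check above, assuming 4 KiB target pages:
 * a RAMBlock with used_length of 4 GiB has nbits = 1048576 dirty bits, so
 * local_size = DIV_ROUND_UP(1048576, 8) = 131072 bytes, already a multiple
 * of 8.  The stream for that block is then expected to carry a be64 size
 * field (131072), 131072 bytes of little-endian bitmap, and the be64
 * RAMBLOCK_RECV_BITMAP_ENDING end mark.
 */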

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes.  When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
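
/*
 * Note: the section version 4 registered here is the same value that
 * ram_load() above insists on, so an incoming "ram" section written with a
 * different version number is rejected with -EINVAL.
 */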