/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "migration/vmstate.h"
#include "postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  It was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static uint8_t *ZERO_TARGET_PAGE;

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

/* This struct contains the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock.
*/ 85 PageCache *cache; 86 QemuMutex lock; 87 } XBZRLE; 88 89 /* buffer used for XBZRLE decoding */ 90 static uint8_t *xbzrle_decoded_buf; 91 92 static void XBZRLE_cache_lock(void) 93 { 94 if (migrate_use_xbzrle()) 95 qemu_mutex_lock(&XBZRLE.lock); 96 } 97 98 static void XBZRLE_cache_unlock(void) 99 { 100 if (migrate_use_xbzrle()) 101 qemu_mutex_unlock(&XBZRLE.lock); 102 } 103 104 /** 105 * xbzrle_cache_resize: resize the xbzrle cache 106 * 107 * This function is called from qmp_migrate_set_cache_size in main 108 * thread, possibly while a migration is in progress. A running 109 * migration may be using the cache and might finish during this call, 110 * hence changes to the cache are protected by XBZRLE.lock(). 111 * 112 * Returns the new_size or negative in case of error. 113 * 114 * @new_size: new cache size 115 */ 116 int64_t xbzrle_cache_resize(int64_t new_size) 117 { 118 PageCache *new_cache; 119 int64_t ret; 120 121 if (new_size < TARGET_PAGE_SIZE) { 122 return -1; 123 } 124 125 XBZRLE_cache_lock(); 126 127 if (XBZRLE.cache != NULL) { 128 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) { 129 goto out_new_size; 130 } 131 new_cache = cache_init(new_size / TARGET_PAGE_SIZE, 132 TARGET_PAGE_SIZE); 133 if (!new_cache) { 134 error_report("Error creating cache"); 135 ret = -1; 136 goto out; 137 } 138 139 cache_fini(XBZRLE.cache); 140 XBZRLE.cache = new_cache; 141 } 142 143 out_new_size: 144 ret = pow2floor(new_size); 145 out: 146 XBZRLE_cache_unlock(); 147 return ret; 148 } 149 150 /* 151 * An outstanding page request, on the source, having been received 152 * and queued 153 */ 154 struct RAMSrcPageRequest { 155 RAMBlock *rb; 156 hwaddr offset; 157 hwaddr len; 158 159 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 160 }; 161 162 /* State of RAM for migration */ 163 struct RAMState { 164 /* QEMUFile used for this migration */ 165 QEMUFile *f; 166 /* Last block that we have visited searching for dirty pages */ 167 RAMBlock *last_seen_block; 168 /* Last block from where we have sent data */ 169 RAMBlock *last_sent_block; 170 /* Last dirty target page we have sent */ 171 ram_addr_t last_page; 172 /* last ram version we have seen */ 173 uint32_t last_version; 174 /* We are in the first round */ 175 bool ram_bulk_stage; 176 /* How many times we have dirty too many pages */ 177 int dirty_rate_high_cnt; 178 /* How many times we have synchronized the bitmap */ 179 uint64_t bitmap_sync_count; 180 /* these variables are used for bitmap sync */ 181 /* last time we did a full bitmap_sync */ 182 int64_t time_last_bitmap_sync; 183 /* bytes transferred at start_time */ 184 uint64_t bytes_xfer_prev; 185 /* number of dirty pages since start_time */ 186 uint64_t num_dirty_pages_period; 187 /* xbzrle misses since the beginning of the period */ 188 uint64_t xbzrle_cache_miss_prev; 189 /* number of iterations at the beginning of period */ 190 uint64_t iterations_prev; 191 /* Accounting fields */ 192 /* number of zero pages. It used to be pages filled by the same char. */ 193 uint64_t zero_pages; 194 /* number of normal transferred pages */ 195 uint64_t norm_pages; 196 /* Iterations since start */ 197 uint64_t iterations; 198 /* xbzrle transmitted bytes. 
Notice that this is with 199 * compression, they can't be calculated from the pages */ 200 uint64_t xbzrle_bytes; 201 /* xbzrle transmmited pages */ 202 uint64_t xbzrle_pages; 203 /* xbzrle number of cache miss */ 204 uint64_t xbzrle_cache_miss; 205 /* xbzrle miss rate */ 206 double xbzrle_cache_miss_rate; 207 /* xbzrle number of overflows */ 208 uint64_t xbzrle_overflows; 209 /* number of dirty bits in the bitmap */ 210 uint64_t migration_dirty_pages; 211 /* total number of bytes transferred */ 212 uint64_t bytes_transferred; 213 /* number of dirtied pages in the last second */ 214 uint64_t dirty_pages_rate; 215 /* Count of requests incoming from destination */ 216 uint64_t postcopy_requests; 217 /* protects modification of the bitmap */ 218 QemuMutex bitmap_mutex; 219 /* The RAMBlock used in the last src_page_requests */ 220 RAMBlock *last_req_rb; 221 /* Queue of outstanding page requests from the destination */ 222 QemuMutex src_page_req_mutex; 223 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests; 224 }; 225 typedef struct RAMState RAMState; 226 227 static RAMState ram_state; 228 229 uint64_t dup_mig_pages_transferred(void) 230 { 231 return ram_state.zero_pages; 232 } 233 234 uint64_t norm_mig_pages_transferred(void) 235 { 236 return ram_state.norm_pages; 237 } 238 239 uint64_t xbzrle_mig_bytes_transferred(void) 240 { 241 return ram_state.xbzrle_bytes; 242 } 243 244 uint64_t xbzrle_mig_pages_transferred(void) 245 { 246 return ram_state.xbzrle_pages; 247 } 248 249 uint64_t xbzrle_mig_pages_cache_miss(void) 250 { 251 return ram_state.xbzrle_cache_miss; 252 } 253 254 double xbzrle_mig_cache_miss_rate(void) 255 { 256 return ram_state.xbzrle_cache_miss_rate; 257 } 258 259 uint64_t xbzrle_mig_pages_overflow(void) 260 { 261 return ram_state.xbzrle_overflows; 262 } 263 264 uint64_t ram_bytes_transferred(void) 265 { 266 return ram_state.bytes_transferred; 267 } 268 269 uint64_t ram_bytes_remaining(void) 270 { 271 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE; 272 } 273 274 uint64_t ram_dirty_sync_count(void) 275 { 276 return ram_state.bitmap_sync_count; 277 } 278 279 uint64_t ram_dirty_pages_rate(void) 280 { 281 return ram_state.dirty_pages_rate; 282 } 283 284 uint64_t ram_postcopy_requests(void) 285 { 286 return ram_state.postcopy_requests; 287 } 288 289 /* used by the search for pages to send */ 290 struct PageSearchStatus { 291 /* Current block being searched */ 292 RAMBlock *block; 293 /* Current page to search from */ 294 unsigned long page; 295 /* Set once we wrap around */ 296 bool complete_round; 297 }; 298 typedef struct PageSearchStatus PageSearchStatus; 299 300 struct CompressParam { 301 bool done; 302 bool quit; 303 QEMUFile *file; 304 QemuMutex mutex; 305 QemuCond cond; 306 RAMBlock *block; 307 ram_addr_t offset; 308 }; 309 typedef struct CompressParam CompressParam; 310 311 struct DecompressParam { 312 bool done; 313 bool quit; 314 QemuMutex mutex; 315 QemuCond cond; 316 void *des; 317 uint8_t *compbuf; 318 int len; 319 }; 320 typedef struct DecompressParam DecompressParam; 321 322 static CompressParam *comp_param; 323 static QemuThread *compress_threads; 324 /* comp_done_cond is used to wake up the migration thread when 325 * one of the compression threads has finished the compression. 326 * comp_done_lock is used to co-work with comp_done_cond. 
327 */ 328 static QemuMutex comp_done_lock; 329 static QemuCond comp_done_cond; 330 /* The empty QEMUFileOps will be used by file in CompressParam */ 331 static const QEMUFileOps empty_ops = { }; 332 333 static DecompressParam *decomp_param; 334 static QemuThread *decompress_threads; 335 static QemuMutex decomp_done_lock; 336 static QemuCond decomp_done_cond; 337 338 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, 339 ram_addr_t offset); 340 341 static void *do_data_compress(void *opaque) 342 { 343 CompressParam *param = opaque; 344 RAMBlock *block; 345 ram_addr_t offset; 346 347 qemu_mutex_lock(¶m->mutex); 348 while (!param->quit) { 349 if (param->block) { 350 block = param->block; 351 offset = param->offset; 352 param->block = NULL; 353 qemu_mutex_unlock(¶m->mutex); 354 355 do_compress_ram_page(param->file, block, offset); 356 357 qemu_mutex_lock(&comp_done_lock); 358 param->done = true; 359 qemu_cond_signal(&comp_done_cond); 360 qemu_mutex_unlock(&comp_done_lock); 361 362 qemu_mutex_lock(¶m->mutex); 363 } else { 364 qemu_cond_wait(¶m->cond, ¶m->mutex); 365 } 366 } 367 qemu_mutex_unlock(¶m->mutex); 368 369 return NULL; 370 } 371 372 static inline void terminate_compression_threads(void) 373 { 374 int idx, thread_count; 375 376 thread_count = migrate_compress_threads(); 377 378 for (idx = 0; idx < thread_count; idx++) { 379 qemu_mutex_lock(&comp_param[idx].mutex); 380 comp_param[idx].quit = true; 381 qemu_cond_signal(&comp_param[idx].cond); 382 qemu_mutex_unlock(&comp_param[idx].mutex); 383 } 384 } 385 386 void migrate_compress_threads_join(void) 387 { 388 int i, thread_count; 389 390 if (!migrate_use_compression()) { 391 return; 392 } 393 terminate_compression_threads(); 394 thread_count = migrate_compress_threads(); 395 for (i = 0; i < thread_count; i++) { 396 qemu_thread_join(compress_threads + i); 397 qemu_fclose(comp_param[i].file); 398 qemu_mutex_destroy(&comp_param[i].mutex); 399 qemu_cond_destroy(&comp_param[i].cond); 400 } 401 qemu_mutex_destroy(&comp_done_lock); 402 qemu_cond_destroy(&comp_done_cond); 403 g_free(compress_threads); 404 g_free(comp_param); 405 compress_threads = NULL; 406 comp_param = NULL; 407 } 408 409 void migrate_compress_threads_create(void) 410 { 411 int i, thread_count; 412 413 if (!migrate_use_compression()) { 414 return; 415 } 416 thread_count = migrate_compress_threads(); 417 compress_threads = g_new0(QemuThread, thread_count); 418 comp_param = g_new0(CompressParam, thread_count); 419 qemu_cond_init(&comp_done_cond); 420 qemu_mutex_init(&comp_done_lock); 421 for (i = 0; i < thread_count; i++) { 422 /* comp_param[i].file is just used as a dummy buffer to save data, 423 * set its ops to empty. 
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
504 */ 505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 506 { 507 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 508 return; 509 } 510 511 /* We don't care if this fails to allocate a new cache page 512 * as long as it updated an old one */ 513 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, 514 rs->bitmap_sync_count); 515 } 516 517 #define ENCODING_FLAG_XBZRLE 0x1 518 519 /** 520 * save_xbzrle_page: compress and send current page 521 * 522 * Returns: 1 means that we wrote the page 523 * 0 means that page is identical to the one already sent 524 * -1 means that xbzrle would be longer than normal 525 * 526 * @rs: current RAM state 527 * @current_data: pointer to the address of the page contents 528 * @current_addr: addr of the page 529 * @block: block that contains the page we want to send 530 * @offset: offset inside the block for the page 531 * @last_stage: if we are at the completion stage 532 */ 533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 534 ram_addr_t current_addr, RAMBlock *block, 535 ram_addr_t offset, bool last_stage) 536 { 537 int encoded_len = 0, bytes_xbzrle; 538 uint8_t *prev_cached_page; 539 540 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) { 541 rs->xbzrle_cache_miss++; 542 if (!last_stage) { 543 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 544 rs->bitmap_sync_count) == -1) { 545 return -1; 546 } else { 547 /* update *current_data when the page has been 548 inserted into cache */ 549 *current_data = get_cached_data(XBZRLE.cache, current_addr); 550 } 551 } 552 return -1; 553 } 554 555 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 556 557 /* save current buffer into memory */ 558 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 559 560 /* XBZRLE encoding (if there is no overflow) */ 561 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 562 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 563 TARGET_PAGE_SIZE); 564 if (encoded_len == 0) { 565 trace_save_xbzrle_page_skipping(); 566 return 0; 567 } else if (encoded_len == -1) { 568 trace_save_xbzrle_page_overflow(); 569 rs->xbzrle_overflows++; 570 /* update data in the cache */ 571 if (!last_stage) { 572 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); 573 *current_data = prev_cached_page; 574 } 575 return -1; 576 } 577 578 /* we need to update the data in the cache, in order to get the same data */ 579 if (!last_stage) { 580 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 581 } 582 583 /* Send XBZRLE based compressed page */ 584 bytes_xbzrle = save_page_header(rs, rs->f, block, 585 offset | RAM_SAVE_FLAG_XBZRLE); 586 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 587 qemu_put_be16(rs->f, encoded_len); 588 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 589 bytes_xbzrle += encoded_len + 1 + 2; 590 rs->xbzrle_pages++; 591 rs->xbzrle_bytes += bytes_xbzrle; 592 rs->bytes_transferred += bytes_xbzrle; 593 594 return 1; 595 } 596 597 /** 598 * migration_bitmap_find_dirty: find the next dirty page from start 599 * 600 * Called with rcu_read_lock() to protect migration_bitmap 601 * 602 * Returns the byte offset within memory region of the start of a dirty page 603 * 604 * @rs: current RAM state 605 * @rb: RAMBlock where to search for dirty pages 606 * @start: page where we start the search 607 */ 608 static inline 609 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 610 unsigned long start) 611 { 612 unsigned long size = 
        rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH(block) {
        summary |= block->page_size;
    }

    return summary;
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    rs->bitmap_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_bytes_transferred();

        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes are 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine.
If that happens twice, start or increase 707 throttling */ 708 709 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE > 710 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) && 711 (++rs->dirty_rate_high_cnt >= 2)) { 712 trace_migration_throttle(); 713 rs->dirty_rate_high_cnt = 0; 714 mig_throttle_guest_down(); 715 } 716 } 717 718 if (migrate_use_xbzrle()) { 719 if (rs->iterations_prev != rs->iterations) { 720 rs->xbzrle_cache_miss_rate = 721 (double)(rs->xbzrle_cache_miss - 722 rs->xbzrle_cache_miss_prev) / 723 (rs->iterations - rs->iterations_prev); 724 } 725 rs->iterations_prev = rs->iterations; 726 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss; 727 } 728 729 /* reset period counters */ 730 rs->time_last_bitmap_sync = end_time; 731 rs->num_dirty_pages_period = 0; 732 rs->bytes_xfer_prev = bytes_xfer_now; 733 } 734 if (migrate_use_events()) { 735 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL); 736 } 737 } 738 739 /** 740 * save_zero_page: send the zero page to the stream 741 * 742 * Returns the number of pages written. 743 * 744 * @rs: current RAM state 745 * @block: block that contains the page we want to send 746 * @offset: offset inside the block for the page 747 * @p: pointer to the page 748 */ 749 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 750 uint8_t *p) 751 { 752 int pages = -1; 753 754 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 755 rs->zero_pages++; 756 rs->bytes_transferred += 757 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO); 758 qemu_put_byte(rs->f, 0); 759 rs->bytes_transferred += 1; 760 pages = 1; 761 } 762 763 return pages; 764 } 765 766 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 767 { 768 if (!migrate_release_ram() || !migration_in_postcopy()) { 769 return; 770 } 771 772 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS); 773 } 774 775 /** 776 * ram_save_page: send the given page to the stream 777 * 778 * Returns the number of pages written. 779 * < 0 - error 780 * >=0 - Number of pages written - this might legally be 0 781 * if xbzrle noticed the page was the same. 
782 * 783 * @rs: current RAM state 784 * @block: block that contains the page we want to send 785 * @offset: offset inside the block for the page 786 * @last_stage: if we are at the completion stage 787 */ 788 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 789 { 790 int pages = -1; 791 uint64_t bytes_xmit; 792 ram_addr_t current_addr; 793 uint8_t *p; 794 int ret; 795 bool send_async = true; 796 RAMBlock *block = pss->block; 797 ram_addr_t offset = pss->page << TARGET_PAGE_BITS; 798 799 p = block->host + offset; 800 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 801 802 /* In doubt sent page as normal */ 803 bytes_xmit = 0; 804 ret = ram_control_save_page(rs->f, block->offset, 805 offset, TARGET_PAGE_SIZE, &bytes_xmit); 806 if (bytes_xmit) { 807 rs->bytes_transferred += bytes_xmit; 808 pages = 1; 809 } 810 811 XBZRLE_cache_lock(); 812 813 current_addr = block->offset + offset; 814 815 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 816 if (ret != RAM_SAVE_CONTROL_DELAYED) { 817 if (bytes_xmit > 0) { 818 rs->norm_pages++; 819 } else if (bytes_xmit == 0) { 820 rs->zero_pages++; 821 } 822 } 823 } else { 824 pages = save_zero_page(rs, block, offset, p); 825 if (pages > 0) { 826 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 827 * page would be stale 828 */ 829 xbzrle_cache_zero_page(rs, current_addr); 830 ram_release_pages(block->idstr, offset, pages); 831 } else if (!rs->ram_bulk_stage && 832 !migration_in_postcopy() && migrate_use_xbzrle()) { 833 pages = save_xbzrle_page(rs, &p, current_addr, block, 834 offset, last_stage); 835 if (!last_stage) { 836 /* Can't send this cached data async, since the cache page 837 * might get updated before it gets to the wire 838 */ 839 send_async = false; 840 } 841 } 842 } 843 844 /* XBZRLE overflow or normal page */ 845 if (pages == -1) { 846 rs->bytes_transferred += save_page_header(rs, rs->f, block, 847 offset | RAM_SAVE_FLAG_PAGE); 848 if (send_async) { 849 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE, 850 migrate_release_ram() & 851 migration_in_postcopy()); 852 } else { 853 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE); 854 } 855 rs->bytes_transferred += TARGET_PAGE_SIZE; 856 pages = 1; 857 rs->norm_pages++; 858 } 859 860 XBZRLE_cache_unlock(); 861 862 return pages; 863 } 864 865 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, 866 ram_addr_t offset) 867 { 868 RAMState *rs = &ram_state; 869 int bytes_sent, blen; 870 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 871 872 bytes_sent = save_page_header(rs, f, block, offset | 873 RAM_SAVE_FLAG_COMPRESS_PAGE); 874 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, 875 migrate_compress_level()); 876 if (blen < 0) { 877 bytes_sent = 0; 878 qemu_file_set_error(migrate_get_current()->to_dst_file, blen); 879 error_report("compressed data failed!"); 880 } else { 881 bytes_sent += blen; 882 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 883 } 884 885 return bytes_sent; 886 } 887 888 static void flush_compressed_data(RAMState *rs) 889 { 890 int idx, len, thread_count; 891 892 if (!migrate_use_compression()) { 893 return; 894 } 895 thread_count = migrate_compress_threads(); 896 897 qemu_mutex_lock(&comp_done_lock); 898 for (idx = 0; idx < thread_count; idx++) { 899 while (!comp_param[idx].done) { 900 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 901 } 902 } 903 qemu_mutex_unlock(&comp_done_lock); 904 905 for (idx = 0; idx < thread_count; idx++) { 906 qemu_mutex_lock(&comp_param[idx].mutex); 907 if 
(!comp_param[idx].quit) { 908 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 909 rs->bytes_transferred += len; 910 } 911 qemu_mutex_unlock(&comp_param[idx].mutex); 912 } 913 } 914 915 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 916 ram_addr_t offset) 917 { 918 param->block = block; 919 param->offset = offset; 920 } 921 922 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 923 ram_addr_t offset) 924 { 925 int idx, thread_count, bytes_xmit = -1, pages = -1; 926 927 thread_count = migrate_compress_threads(); 928 qemu_mutex_lock(&comp_done_lock); 929 while (true) { 930 for (idx = 0; idx < thread_count; idx++) { 931 if (comp_param[idx].done) { 932 comp_param[idx].done = false; 933 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 934 qemu_mutex_lock(&comp_param[idx].mutex); 935 set_compress_params(&comp_param[idx], block, offset); 936 qemu_cond_signal(&comp_param[idx].cond); 937 qemu_mutex_unlock(&comp_param[idx].mutex); 938 pages = 1; 939 rs->norm_pages++; 940 rs->bytes_transferred += bytes_xmit; 941 break; 942 } 943 } 944 if (pages > 0) { 945 break; 946 } else { 947 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 948 } 949 } 950 qemu_mutex_unlock(&comp_done_lock); 951 952 return pages; 953 } 954 955 /** 956 * ram_save_compressed_page: compress the given page and send it to the stream 957 * 958 * Returns the number of pages written. 959 * 960 * @rs: current RAM state 961 * @block: block that contains the page we want to send 962 * @offset: offset inside the block for the page 963 * @last_stage: if we are at the completion stage 964 */ 965 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss, 966 bool last_stage) 967 { 968 int pages = -1; 969 uint64_t bytes_xmit = 0; 970 uint8_t *p; 971 int ret, blen; 972 RAMBlock *block = pss->block; 973 ram_addr_t offset = pss->page << TARGET_PAGE_BITS; 974 975 p = block->host + offset; 976 977 ret = ram_control_save_page(rs->f, block->offset, 978 offset, TARGET_PAGE_SIZE, &bytes_xmit); 979 if (bytes_xmit) { 980 rs->bytes_transferred += bytes_xmit; 981 pages = 1; 982 } 983 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 984 if (ret != RAM_SAVE_CONTROL_DELAYED) { 985 if (bytes_xmit > 0) { 986 rs->norm_pages++; 987 } else if (bytes_xmit == 0) { 988 rs->zero_pages++; 989 } 990 } 991 } else { 992 /* When starting the process of a new block, the first page of 993 * the block should be sent out before other pages in the same 994 * block, and all the pages in last block should have been sent 995 * out, keeping this order is important, because the 'cont' flag 996 * is used to avoid resending the block name. 
997 */ 998 if (block != rs->last_sent_block) { 999 flush_compressed_data(rs); 1000 pages = save_zero_page(rs, block, offset, p); 1001 if (pages == -1) { 1002 /* Make sure the first page is sent out before other pages */ 1003 bytes_xmit = save_page_header(rs, rs->f, block, offset | 1004 RAM_SAVE_FLAG_COMPRESS_PAGE); 1005 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE, 1006 migrate_compress_level()); 1007 if (blen > 0) { 1008 rs->bytes_transferred += bytes_xmit + blen; 1009 rs->norm_pages++; 1010 pages = 1; 1011 } else { 1012 qemu_file_set_error(rs->f, blen); 1013 error_report("compressed data failed!"); 1014 } 1015 } 1016 if (pages > 0) { 1017 ram_release_pages(block->idstr, offset, pages); 1018 } 1019 } else { 1020 pages = save_zero_page(rs, block, offset, p); 1021 if (pages == -1) { 1022 pages = compress_page_with_multi_thread(rs, block, offset); 1023 } else { 1024 ram_release_pages(block->idstr, offset, pages); 1025 } 1026 } 1027 } 1028 1029 return pages; 1030 } 1031 1032 /** 1033 * find_dirty_block: find the next dirty page and update any state 1034 * associated with the search process. 1035 * 1036 * Returns if a page is found 1037 * 1038 * @rs: current RAM state 1039 * @pss: data about the state of the current dirty page scan 1040 * @again: set to false if the search has scanned the whole of RAM 1041 */ 1042 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1043 { 1044 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1045 if (pss->complete_round && pss->block == rs->last_seen_block && 1046 pss->page >= rs->last_page) { 1047 /* 1048 * We've been once around the RAM and haven't found anything. 1049 * Give up. 1050 */ 1051 *again = false; 1052 return false; 1053 } 1054 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) { 1055 /* Didn't find anything in this RAM Block */ 1056 pss->page = 0; 1057 pss->block = QLIST_NEXT_RCU(pss->block, next); 1058 if (!pss->block) { 1059 /* Hit the end of the list */ 1060 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1061 /* Flag that we've looped */ 1062 pss->complete_round = true; 1063 rs->ram_bulk_stage = false; 1064 if (migrate_use_xbzrle()) { 1065 /* If xbzrle is on, stop using the data compression at this 1066 * point. In theory, xbzrle can do better than compression. 1067 */ 1068 flush_compressed_data(rs); 1069 } 1070 } 1071 /* Didn't find anything this time, but try again on the new block */ 1072 *again = true; 1073 return false; 1074 } else { 1075 /* Can go around again, but... 
 */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns whether a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case there is any page left, we drop it.
 *
 */
void migration_page_queue_free(void)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    RAMState *rs = &ram_state;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
1191 */ 1192 rcu_read_lock(); 1193 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1194 memory_region_unref(mspr->rb->mr); 1195 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1196 g_free(mspr); 1197 } 1198 rcu_read_unlock(); 1199 } 1200 1201 /** 1202 * ram_save_queue_pages: queue the page for transmission 1203 * 1204 * A request from postcopy destination for example. 1205 * 1206 * Returns zero on success or negative on error 1207 * 1208 * @rbname: Name of the RAMBLock of the request. NULL means the 1209 * same that last one. 1210 * @start: starting address from the start of the RAMBlock 1211 * @len: length (in bytes) to send 1212 */ 1213 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1214 { 1215 RAMBlock *ramblock; 1216 RAMState *rs = &ram_state; 1217 1218 rs->postcopy_requests++; 1219 rcu_read_lock(); 1220 if (!rbname) { 1221 /* Reuse last RAMBlock */ 1222 ramblock = rs->last_req_rb; 1223 1224 if (!ramblock) { 1225 /* 1226 * Shouldn't happen, we can't reuse the last RAMBlock if 1227 * it's the 1st request. 1228 */ 1229 error_report("ram_save_queue_pages no previous block"); 1230 goto err; 1231 } 1232 } else { 1233 ramblock = qemu_ram_block_by_name(rbname); 1234 1235 if (!ramblock) { 1236 /* We shouldn't be asked for a non-existent RAMBlock */ 1237 error_report("ram_save_queue_pages no block '%s'", rbname); 1238 goto err; 1239 } 1240 rs->last_req_rb = ramblock; 1241 } 1242 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1243 if (start+len > ramblock->used_length) { 1244 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1245 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1246 __func__, start, len, ramblock->used_length); 1247 goto err; 1248 } 1249 1250 struct RAMSrcPageRequest *new_entry = 1251 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1252 new_entry->rb = ramblock; 1253 new_entry->offset = start; 1254 new_entry->len = len; 1255 1256 memory_region_ref(ramblock->mr); 1257 qemu_mutex_lock(&rs->src_page_req_mutex); 1258 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1259 qemu_mutex_unlock(&rs->src_page_req_mutex); 1260 rcu_read_unlock(); 1261 1262 return 0; 1263 1264 err: 1265 rcu_read_unlock(); 1266 return -1; 1267 } 1268 1269 /** 1270 * ram_save_target_page: save one target page 1271 * 1272 * Returns the number of pages written 1273 * 1274 * @rs: current RAM state 1275 * @ms: current migration state 1276 * @pss: data about the page we want to send 1277 * @last_stage: if we are at the completion stage 1278 */ 1279 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1280 bool last_stage) 1281 { 1282 int res = 0; 1283 1284 /* Check the pages is dirty and if it is send it */ 1285 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 1286 /* 1287 * If xbzrle is on, stop using the data compression after first 1288 * round of migration even if compression is enabled. In theory, 1289 * xbzrle can do better than compression. 1290 */ 1291 if (migrate_use_compression() && 1292 (rs->ram_bulk_stage || !migrate_use_xbzrle())) { 1293 res = ram_save_compressed_page(rs, pss, last_stage); 1294 } else { 1295 res = ram_save_page(rs, pss, last_stage); 1296 } 1297 1298 if (res < 0) { 1299 return res; 1300 } 1301 if (pss->block->unsentmap) { 1302 clear_bit(pss->page, pss->block->unsentmap); 1303 } 1304 } 1305 1306 return res; 1307 } 1308 1309 /** 1310 * ram_save_host_page: save a whole host page 1311 * 1312 * Starting at *offset send pages up to the end of the current host 1313 * page. 
It's valid for the initial offset to point into the middle of 1314 * a host page in which case the remainder of the hostpage is sent. 1315 * Only dirty target pages are sent. Note that the host page size may 1316 * be a huge page for this block. 1317 * The saving stops at the boundary of the used_length of the block 1318 * if the RAMBlock isn't a multiple of the host page size. 1319 * 1320 * Returns the number of pages written or negative on error 1321 * 1322 * @rs: current RAM state 1323 * @ms: current migration state 1324 * @pss: data about the page we want to send 1325 * @last_stage: if we are at the completion stage 1326 */ 1327 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 1328 bool last_stage) 1329 { 1330 int tmppages, pages = 0; 1331 size_t pagesize_bits = 1332 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 1333 1334 do { 1335 tmppages = ram_save_target_page(rs, pss, last_stage); 1336 if (tmppages < 0) { 1337 return tmppages; 1338 } 1339 1340 pages += tmppages; 1341 pss->page++; 1342 } while ((pss->page & (pagesize_bits - 1)) && 1343 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); 1344 1345 /* The offset we leave with is the last one we looked at */ 1346 pss->page--; 1347 return pages; 1348 } 1349 1350 /** 1351 * ram_find_and_save_block: finds a dirty page and sends it to f 1352 * 1353 * Called within an RCU critical section. 1354 * 1355 * Returns the number of pages written where zero means no dirty pages 1356 * 1357 * @rs: current RAM state 1358 * @last_stage: if we are at the completion stage 1359 * 1360 * On systems where host-page-size > target-page-size it will send all the 1361 * pages in a host page that are dirty. 1362 */ 1363 1364 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1365 { 1366 PageSearchStatus pss; 1367 int pages = 0; 1368 bool again, found; 1369 1370 /* No dirty page as there is zero RAM */ 1371 if (!ram_bytes_total()) { 1372 return pages; 1373 } 1374 1375 pss.block = rs->last_seen_block; 1376 pss.page = rs->last_page; 1377 pss.complete_round = false; 1378 1379 if (!pss.block) { 1380 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1381 } 1382 1383 do { 1384 again = true; 1385 found = get_queued_page(rs, &pss); 1386 1387 if (!found) { 1388 /* priority queue empty, so just search for something dirty */ 1389 found = find_dirty_block(rs, &pss, &again); 1390 } 1391 1392 if (found) { 1393 pages = ram_save_host_page(rs, &pss, last_stage); 1394 } 1395 } while (!pages && again); 1396 1397 rs->last_seen_block = pss.block; 1398 rs->last_page = pss.page; 1399 1400 return pages; 1401 } 1402 1403 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1404 { 1405 uint64_t pages = size / TARGET_PAGE_SIZE; 1406 RAMState *rs = &ram_state; 1407 1408 if (zero) { 1409 rs->zero_pages += pages; 1410 } else { 1411 rs->norm_pages += pages; 1412 rs->bytes_transferred += size; 1413 qemu_update_position(f, size); 1414 } 1415 } 1416 1417 uint64_t ram_bytes_total(void) 1418 { 1419 RAMBlock *block; 1420 uint64_t total = 0; 1421 1422 rcu_read_lock(); 1423 RAMBLOCK_FOREACH(block) { 1424 total += block->used_length; 1425 } 1426 rcu_read_unlock(); 1427 return total; 1428 } 1429 1430 void free_xbzrle_decoded_buf(void) 1431 { 1432 g_free(xbzrle_decoded_buf); 1433 xbzrle_decoded_buf = NULL; 1434 } 1435 1436 static void ram_migration_cleanup(void *opaque) 1437 { 1438 RAMBlock *block; 1439 1440 /* caller have hold iothread lock or is in a bh, so there is 1441 * no writing race against this migration_bitmap 1442 */ 1443 
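    /* Dirty memory logging was started in ram_state_init(); stop it here,
     * before the per-RAMBlock bmap/unsentmap bitmaps are freed below. */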
memory_global_dirty_log_stop(); 1444 1445 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1446 g_free(block->bmap); 1447 block->bmap = NULL; 1448 g_free(block->unsentmap); 1449 block->unsentmap = NULL; 1450 } 1451 1452 XBZRLE_cache_lock(); 1453 if (XBZRLE.cache) { 1454 cache_fini(XBZRLE.cache); 1455 g_free(XBZRLE.encoded_buf); 1456 g_free(XBZRLE.current_buf); 1457 g_free(ZERO_TARGET_PAGE); 1458 XBZRLE.cache = NULL; 1459 XBZRLE.encoded_buf = NULL; 1460 XBZRLE.current_buf = NULL; 1461 } 1462 XBZRLE_cache_unlock(); 1463 } 1464 1465 static void ram_state_reset(RAMState *rs) 1466 { 1467 rs->last_seen_block = NULL; 1468 rs->last_sent_block = NULL; 1469 rs->last_page = 0; 1470 rs->last_version = ram_list.version; 1471 rs->ram_bulk_stage = true; 1472 } 1473 1474 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1475 1476 /* 1477 * 'expected' is the value you expect the bitmap mostly to be full 1478 * of; it won't bother printing lines that are all this value. 1479 * If 'todump' is null the migration bitmap is dumped. 1480 */ 1481 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 1482 unsigned long pages) 1483 { 1484 int64_t cur; 1485 int64_t linelen = 128; 1486 char linebuf[129]; 1487 1488 for (cur = 0; cur < pages; cur += linelen) { 1489 int64_t curb; 1490 bool found = false; 1491 /* 1492 * Last line; catch the case where the line length 1493 * is longer than remaining ram 1494 */ 1495 if (cur + linelen > pages) { 1496 linelen = pages - cur; 1497 } 1498 for (curb = 0; curb < linelen; curb++) { 1499 bool thisbit = test_bit(cur + curb, todump); 1500 linebuf[curb] = thisbit ? '1' : '.'; 1501 found = found || (thisbit != expected); 1502 } 1503 if (found) { 1504 linebuf[curb] = '\0'; 1505 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1506 } 1507 } 1508 } 1509 1510 /* **** functions for postcopy ***** */ 1511 1512 void ram_postcopy_migrated_memory_release(MigrationState *ms) 1513 { 1514 struct RAMBlock *block; 1515 1516 RAMBLOCK_FOREACH(block) { 1517 unsigned long *bitmap = block->bmap; 1518 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 1519 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 1520 1521 while (run_start < range) { 1522 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 1523 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS, 1524 (run_end - run_start) << TARGET_PAGE_BITS); 1525 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 1526 } 1527 } 1528 } 1529 1530 /** 1531 * postcopy_send_discard_bm_ram: discard a RAMBlock 1532 * 1533 * Returns zero on success 1534 * 1535 * Callback from postcopy_each_ram_send_discard for each RAMBlock 1536 * Note: At this point the 'unsentmap' is the processed bitmap combined 1537 * with the dirtymap; so a '1' means it's either dirty or unsent. 
1538 * 1539 * @ms: current migration state 1540 * @pds: state for postcopy 1541 * @start: RAMBlock starting page 1542 * @length: RAMBlock size 1543 */ 1544 static int postcopy_send_discard_bm_ram(MigrationState *ms, 1545 PostcopyDiscardState *pds, 1546 RAMBlock *block) 1547 { 1548 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 1549 unsigned long current; 1550 unsigned long *unsentmap = block->unsentmap; 1551 1552 for (current = 0; current < end; ) { 1553 unsigned long one = find_next_bit(unsentmap, end, current); 1554 1555 if (one <= end) { 1556 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); 1557 unsigned long discard_length; 1558 1559 if (zero >= end) { 1560 discard_length = end - one; 1561 } else { 1562 discard_length = zero - one; 1563 } 1564 if (discard_length) { 1565 postcopy_discard_send_range(ms, pds, one, discard_length); 1566 } 1567 current = one + discard_length; 1568 } else { 1569 current = one; 1570 } 1571 } 1572 1573 return 0; 1574 } 1575 1576 /** 1577 * postcopy_each_ram_send_discard: discard all RAMBlocks 1578 * 1579 * Returns 0 for success or negative for error 1580 * 1581 * Utility for the outgoing postcopy code. 1582 * Calls postcopy_send_discard_bm_ram for each RAMBlock 1583 * passing it bitmap indexes and name. 1584 * (qemu_ram_foreach_block ends up passing unscaled lengths 1585 * which would mean postcopy code would have to deal with target page) 1586 * 1587 * @ms: current migration state 1588 */ 1589 static int postcopy_each_ram_send_discard(MigrationState *ms) 1590 { 1591 struct RAMBlock *block; 1592 int ret; 1593 1594 RAMBLOCK_FOREACH(block) { 1595 PostcopyDiscardState *pds = 1596 postcopy_discard_send_init(ms, block->idstr); 1597 1598 /* 1599 * Postcopy sends chunks of bitmap over the wire, but it 1600 * just needs indexes at this point, avoids it having 1601 * target page specific code. 1602 */ 1603 ret = postcopy_send_discard_bm_ram(ms, pds, block); 1604 postcopy_discard_send_finish(ms, pds); 1605 if (ret) { 1606 return ret; 1607 } 1608 } 1609 1610 return 0; 1611 } 1612 1613 /** 1614 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages 1615 * 1616 * Helper for postcopy_chunk_hostpages; it's called twice to 1617 * canonicalize the two bitmaps, that are similar, but one is 1618 * inverted. 1619 * 1620 * Postcopy requires that all target pages in a hostpage are dirty or 1621 * clean, not a mix. This function canonicalizes the bitmaps. 
1622 * 1623 * @ms: current migration state 1624 * @unsent_pass: if true we need to canonicalize partially unsent host pages 1625 * otherwise we need to canonicalize partially dirty host pages 1626 * @block: block that contains the page we want to canonicalize 1627 * @pds: state for postcopy 1628 */ 1629 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, 1630 RAMBlock *block, 1631 PostcopyDiscardState *pds) 1632 { 1633 RAMState *rs = &ram_state; 1634 unsigned long *bitmap = block->bmap; 1635 unsigned long *unsentmap = block->unsentmap; 1636 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 1637 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 1638 unsigned long run_start; 1639 1640 if (block->page_size == TARGET_PAGE_SIZE) { 1641 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 1642 return; 1643 } 1644 1645 if (unsent_pass) { 1646 /* Find a sent page */ 1647 run_start = find_next_zero_bit(unsentmap, pages, 0); 1648 } else { 1649 /* Find a dirty page */ 1650 run_start = find_next_bit(bitmap, pages, 0); 1651 } 1652 1653 while (run_start < pages) { 1654 bool do_fixup = false; 1655 unsigned long fixup_start_addr; 1656 unsigned long host_offset; 1657 1658 /* 1659 * If the start of this run of pages is in the middle of a host 1660 * page, then we need to fixup this host page. 1661 */ 1662 host_offset = run_start % host_ratio; 1663 if (host_offset) { 1664 do_fixup = true; 1665 run_start -= host_offset; 1666 fixup_start_addr = run_start; 1667 /* For the next pass */ 1668 run_start = run_start + host_ratio; 1669 } else { 1670 /* Find the end of this run */ 1671 unsigned long run_end; 1672 if (unsent_pass) { 1673 run_end = find_next_bit(unsentmap, pages, run_start + 1); 1674 } else { 1675 run_end = find_next_zero_bit(bitmap, pages, run_start + 1); 1676 } 1677 /* 1678 * If the end isn't at the start of a host page, then the 1679 * run doesn't finish at the end of a host page 1680 * and we need to discard. 1681 */ 1682 host_offset = run_end % host_ratio; 1683 if (host_offset) { 1684 do_fixup = true; 1685 fixup_start_addr = run_end - host_offset; 1686 /* 1687 * This host page has gone, the next loop iteration starts 1688 * from after the fixup 1689 */ 1690 run_start = fixup_start_addr + host_ratio; 1691 } else { 1692 /* 1693 * No discards on this iteration, next loop starts from 1694 * next sent/dirty page 1695 */ 1696 run_start = run_end + 1; 1697 } 1698 } 1699 1700 if (do_fixup) { 1701 unsigned long page; 1702 1703 /* Tell the destination to discard this page */ 1704 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { 1705 /* For the unsent_pass we: 1706 * discard partially sent pages 1707 * For the !unsent_pass (dirty) we: 1708 * discard partially dirty pages that were sent 1709 * (any partially sent pages were already discarded 1710 * by the previous unsent_pass) 1711 */ 1712 postcopy_discard_send_range(ms, pds, fixup_start_addr, 1713 host_ratio); 1714 } 1715 1716 /* Clean up the bitmap */ 1717 for (page = fixup_start_addr; 1718 page < fixup_start_addr + host_ratio; page++) { 1719 /* All pages in this host page are now not sent */ 1720 set_bit(page, unsentmap); 1721 1722 /* 1723 * Remark them as dirty, updating the count for any pages 1724 * that weren't previously dirty. 
1725 */ 1726 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 1727 } 1728 } 1729 1730 if (unsent_pass) { 1731 /* Find the next sent page for the next iteration */ 1732 run_start = find_next_zero_bit(unsentmap, pages, run_start); 1733 } else { 1734 /* Find the next dirty page for the next iteration */ 1735 run_start = find_next_bit(bitmap, pages, run_start); 1736 } 1737 } 1738 } 1739 1740 /** 1741 * postcopy_chuck_hostpages: discrad any partially sent host page 1742 * 1743 * Utility for the outgoing postcopy code. 1744 * 1745 * Discard any partially sent host-page size chunks, mark any partially 1746 * dirty host-page size chunks as all dirty. In this case the host-page 1747 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 1748 * 1749 * Returns zero on success 1750 * 1751 * @ms: current migration state 1752 * @block: block we want to work with 1753 */ 1754 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 1755 { 1756 PostcopyDiscardState *pds = 1757 postcopy_discard_send_init(ms, block->idstr); 1758 1759 /* First pass: Discard all partially sent host pages */ 1760 postcopy_chunk_hostpages_pass(ms, true, block, pds); 1761 /* 1762 * Second pass: Ensure that all partially dirty host pages are made 1763 * fully dirty. 1764 */ 1765 postcopy_chunk_hostpages_pass(ms, false, block, pds); 1766 1767 postcopy_discard_send_finish(ms, pds); 1768 return 0; 1769 } 1770 1771 /** 1772 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 1773 * 1774 * Returns zero on success 1775 * 1776 * Transmit the set of pages to be discarded after precopy to the target 1777 * these are pages that: 1778 * a) Have been previously transmitted but are now dirty again 1779 * b) Pages that have never been transmitted, this ensures that 1780 * any pages on the destination that have been mapped by background 1781 * tasks get discarded (transparent huge pages is the specific concern) 1782 * Hopefully this is pretty sparse 1783 * 1784 * @ms: current migration state 1785 */ 1786 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 1787 { 1788 RAMState *rs = &ram_state; 1789 RAMBlock *block; 1790 int ret; 1791 1792 rcu_read_lock(); 1793 1794 /* This should be our last sync, the src is now paused */ 1795 migration_bitmap_sync(rs); 1796 1797 /* Easiest way to make sure we don't resume in the middle of a host-page */ 1798 rs->last_seen_block = NULL; 1799 rs->last_sent_block = NULL; 1800 rs->last_page = 0; 1801 1802 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1803 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 1804 unsigned long *bitmap = block->bmap; 1805 unsigned long *unsentmap = block->unsentmap; 1806 1807 if (!unsentmap) { 1808 /* We don't have a safe way to resize the sentmap, so 1809 * if the bitmap was resized it will be NULL at this 1810 * point. 
1811 */ 1812 error_report("migration ram resized during precopy phase"); 1813 rcu_read_unlock(); 1814 return -EINVAL; 1815 } 1816 /* Deal with TPS != HPS and huge pages */ 1817 ret = postcopy_chunk_hostpages(ms, block); 1818 if (ret) { 1819 rcu_read_unlock(); 1820 return ret; 1821 } 1822 1823 /* 1824 * Update the unsentmap to be unsentmap = unsentmap | dirty 1825 */ 1826 bitmap_or(unsentmap, unsentmap, bitmap, pages); 1827 #ifdef DEBUG_POSTCOPY 1828 ram_debug_dump_bitmap(unsentmap, true, pages); 1829 #endif 1830 } 1831 trace_ram_postcopy_send_discard_bitmap(); 1832 1833 ret = postcopy_each_ram_send_discard(ms); 1834 rcu_read_unlock(); 1835 1836 return ret; 1837 } 1838 1839 /** 1840 * ram_discard_range: discard dirtied pages at the beginning of postcopy 1841 * 1842 * Returns zero on success 1843 * 1844 * @rbname: name of the RAMBlock of the request. NULL means the 1845 * same that last one. 1846 * @start: RAMBlock starting page 1847 * @length: RAMBlock size 1848 */ 1849 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 1850 { 1851 int ret = -1; 1852 1853 trace_ram_discard_range(rbname, start, length); 1854 1855 rcu_read_lock(); 1856 RAMBlock *rb = qemu_ram_block_by_name(rbname); 1857 1858 if (!rb) { 1859 error_report("ram_discard_range: Failed to find block '%s'", rbname); 1860 goto err; 1861 } 1862 1863 ret = ram_block_discard_range(rb, start, length); 1864 1865 err: 1866 rcu_read_unlock(); 1867 1868 return ret; 1869 } 1870 1871 static int ram_state_init(RAMState *rs) 1872 { 1873 memset(rs, 0, sizeof(*rs)); 1874 qemu_mutex_init(&rs->bitmap_mutex); 1875 qemu_mutex_init(&rs->src_page_req_mutex); 1876 QSIMPLEQ_INIT(&rs->src_page_requests); 1877 1878 if (migrate_use_xbzrle()) { 1879 XBZRLE_cache_lock(); 1880 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE); 1881 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / 1882 TARGET_PAGE_SIZE, 1883 TARGET_PAGE_SIZE); 1884 if (!XBZRLE.cache) { 1885 XBZRLE_cache_unlock(); 1886 error_report("Error creating cache"); 1887 return -1; 1888 } 1889 XBZRLE_cache_unlock(); 1890 1891 /* We prefer not to abort if there is no memory */ 1892 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 1893 if (!XBZRLE.encoded_buf) { 1894 error_report("Error allocating encoded_buf"); 1895 return -1; 1896 } 1897 1898 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 1899 if (!XBZRLE.current_buf) { 1900 error_report("Error allocating current_buf"); 1901 g_free(XBZRLE.encoded_buf); 1902 XBZRLE.encoded_buf = NULL; 1903 return -1; 1904 } 1905 } 1906 1907 /* For memory_global_dirty_log_start below. */ 1908 qemu_mutex_lock_iothread(); 1909 1910 qemu_mutex_lock_ramlist(); 1911 rcu_read_lock(); 1912 ram_state_reset(rs); 1913 1914 /* Skip setting bitmap if there is no RAM */ 1915 if (ram_bytes_total()) { 1916 RAMBlock *block; 1917 1918 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1919 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 1920 1921 block->bmap = bitmap_new(pages); 1922 bitmap_set(block->bmap, 0, pages); 1923 if (migrate_postcopy_ram()) { 1924 block->unsentmap = bitmap_new(pages); 1925 bitmap_set(block->unsentmap, 0, pages); 1926 } 1927 } 1928 } 1929 1930 /* 1931 * Count the total number of pages used by ram blocks not including any 1932 * gaps due to alignment or unplugs. 
 */
    rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: set up RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;
    RAMBlock *block;

    /* migration has already set up the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_state_init(rs) < 0) {
            return -1;
        }
    }
    rs->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once
           every few iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
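     * The EOS marker written below is a single be64 flag word, which is why
     * a fixed 8 bytes is added to bytes_transferred.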
 */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    rs->bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of RAM
 *
 * Returns zero to indicate success
 *
 * Called with the iothread lock held
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState *rs = opaque;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState *rs = opaque;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within an RCU critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
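 *
 * On the wire the block reference is either implicit (RAM_SAVE_FLAG_CONTINUE
 * set, meaning "same block as the previous page") or an explicit one-byte id
 * length followed by the idstr; that is the encoding parsed below.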
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled with. We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() may fail in some cases, especially
             * when the page was dirtied while it was being compressed.
             * That is not a problem, because the dirty page will be
             * retransferred and uncompress() won't corrupt the data
             * in other pages.
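             * For that reason the return value of uncompress() is
             * deliberately ignored here.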
 */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
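             *
             * For example (sizes illustrative only): with 4 KiB target pages
             * backed by a 2 MiB hugetlbfs block, 512 consecutive target pages
             * are gathered in the temporary page; when the last one arrives,
             * 'host' points at the final target page, so the start of the
             * host page is host + TARGET_PAGE_SIZE - block->page_size, as
             * computed for place_dest below.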
2440 */ 2441 page_buffer = postcopy_host_page + 2442 ((uintptr_t)host & (block->page_size - 1)); 2443 /* If all TP are zero then we can optimise the place */ 2444 if (!((uintptr_t)host & (block->page_size - 1))) { 2445 all_zero = true; 2446 } else { 2447 /* not the 1st TP within the HP */ 2448 if (host != (last_host + TARGET_PAGE_SIZE)) { 2449 error_report("Non-sequential target page %p/%p", 2450 host, last_host); 2451 ret = -EINVAL; 2452 break; 2453 } 2454 } 2455 2456 2457 /* 2458 * If it's the last part of a host page then we place the host 2459 * page 2460 */ 2461 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & 2462 (block->page_size - 1)) == 0; 2463 place_source = postcopy_host_page; 2464 } 2465 last_host = host; 2466 2467 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2468 case RAM_SAVE_FLAG_ZERO: 2469 ch = qemu_get_byte(f); 2470 memset(page_buffer, ch, TARGET_PAGE_SIZE); 2471 if (ch) { 2472 all_zero = false; 2473 } 2474 break; 2475 2476 case RAM_SAVE_FLAG_PAGE: 2477 all_zero = false; 2478 if (!place_needed || !matching_page_sizes) { 2479 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 2480 } else { 2481 /* Avoids the qemu_file copy during postcopy, which is 2482 * going to do a copy later; can only do it when we 2483 * do this read in one go (matching page sizes) 2484 */ 2485 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 2486 TARGET_PAGE_SIZE); 2487 } 2488 break; 2489 case RAM_SAVE_FLAG_EOS: 2490 /* normal exit */ 2491 break; 2492 default: 2493 error_report("Unknown combination of migration flags: %#x" 2494 " (postcopy mode)", flags); 2495 ret = -EINVAL; 2496 } 2497 2498 if (place_needed) { 2499 /* This gets called at the last target page in the host page */ 2500 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size; 2501 2502 if (all_zero) { 2503 ret = postcopy_place_page_zero(mis, place_dest, 2504 block->page_size); 2505 } else { 2506 ret = postcopy_place_page(mis, place_dest, 2507 place_source, block->page_size); 2508 } 2509 } 2510 if (!ret) { 2511 ret = qemu_file_get_error(f); 2512 } 2513 } 2514 2515 return ret; 2516 } 2517 2518 static int ram_load(QEMUFile *f, void *opaque, int version_id) 2519 { 2520 int flags = 0, ret = 0; 2521 static uint64_t seq_iter; 2522 int len = 0; 2523 /* 2524 * If system is running in postcopy mode, page inserts to host memory must 2525 * be atomic 2526 */ 2527 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; 2528 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 2529 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE; 2530 2531 seq_iter++; 2532 2533 if (version_id != 4) { 2534 ret = -EINVAL; 2535 } 2536 2537 /* This RCU critical section can be very long running. 2538 * When RCU reclaims in the code start to become numerous, 2539 * it will be necessary to reduce the granularity of this 2540 * critical section. 
2541 */ 2542 rcu_read_lock(); 2543 2544 if (postcopy_running) { 2545 ret = ram_load_postcopy(f); 2546 } 2547 2548 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { 2549 ram_addr_t addr, total_ram_bytes; 2550 void *host = NULL; 2551 uint8_t ch; 2552 2553 addr = qemu_get_be64(f); 2554 flags = addr & ~TARGET_PAGE_MASK; 2555 addr &= TARGET_PAGE_MASK; 2556 2557 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 2558 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 2559 RAMBlock *block = ram_block_from_stream(f, flags); 2560 2561 host = host_from_ram_block_offset(block, addr); 2562 if (!host) { 2563 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 2564 ret = -EINVAL; 2565 break; 2566 } 2567 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 2568 } 2569 2570 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2571 case RAM_SAVE_FLAG_MEM_SIZE: 2572 /* Synchronize RAM block list */ 2573 total_ram_bytes = addr; 2574 while (!ret && total_ram_bytes) { 2575 RAMBlock *block; 2576 char id[256]; 2577 ram_addr_t length; 2578 2579 len = qemu_get_byte(f); 2580 qemu_get_buffer(f, (uint8_t *)id, len); 2581 id[len] = 0; 2582 length = qemu_get_be64(f); 2583 2584 block = qemu_ram_block_by_name(id); 2585 if (block) { 2586 if (length != block->used_length) { 2587 Error *local_err = NULL; 2588 2589 ret = qemu_ram_resize(block, length, 2590 &local_err); 2591 if (local_err) { 2592 error_report_err(local_err); 2593 } 2594 } 2595 /* For postcopy we need to check hugepage sizes match */ 2596 if (postcopy_advised && 2597 block->page_size != qemu_host_page_size) { 2598 uint64_t remote_page_size = qemu_get_be64(f); 2599 if (remote_page_size != block->page_size) { 2600 error_report("Mismatched RAM page size %s " 2601 "(local) %zd != %" PRId64, 2602 id, block->page_size, 2603 remote_page_size); 2604 ret = -EINVAL; 2605 } 2606 } 2607 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 2608 block->idstr); 2609 } else { 2610 error_report("Unknown ramblock \"%s\", cannot " 2611 "accept migration", id); 2612 ret = -EINVAL; 2613 } 2614 2615 total_ram_bytes -= length; 2616 } 2617 break; 2618 2619 case RAM_SAVE_FLAG_ZERO: 2620 ch = qemu_get_byte(f); 2621 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 2622 break; 2623 2624 case RAM_SAVE_FLAG_PAGE: 2625 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 2626 break; 2627 2628 case RAM_SAVE_FLAG_COMPRESS_PAGE: 2629 len = qemu_get_be32(f); 2630 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 2631 error_report("Invalid compressed data length: %d", len); 2632 ret = -EINVAL; 2633 break; 2634 } 2635 decompress_data_with_multi_threads(f, host, len); 2636 break; 2637 2638 case RAM_SAVE_FLAG_XBZRLE: 2639 if (load_xbzrle(f, addr, host) < 0) { 2640 error_report("Failed to decompress XBZRLE page at " 2641 RAM_ADDR_FMT, addr); 2642 ret = -EINVAL; 2643 break; 2644 } 2645 break; 2646 case RAM_SAVE_FLAG_EOS: 2647 /* normal exit */ 2648 break; 2649 default: 2650 if (flags & RAM_SAVE_FLAG_HOOK) { 2651 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 2652 } else { 2653 error_report("Unknown combination of migration flags: %#x", 2654 flags); 2655 ret = -EINVAL; 2656 } 2657 } 2658 if (!ret) { 2659 ret = qemu_file_get_error(f); 2660 } 2661 } 2662 2663 wait_for_decompress_done(); 2664 rcu_read_unlock(); 2665 trace_ram_load_complete(ret, seq_iter); 2666 return ret; 2667 } 2668 2669 static SaveVMHandlers savevm_ram_handlers = { 2670 .save_live_setup = ram_save_setup, 2671 .save_live_iterate = ram_save_iterate, 2672 .save_live_complete_postcopy = ram_save_complete, 2673 
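    /* ram_save_complete handles both the postcopy and precopy
     * completion cases */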
.save_live_complete_precopy = ram_save_complete, 2674 .save_live_pending = ram_save_pending, 2675 .load_state = ram_load, 2676 .cleanup = ram_migration_cleanup, 2677 }; 2678 2679 void ram_mig_init(void) 2680 { 2681 qemu_mutex_init(&XBZRLE.lock); 2682 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state); 2683 } 2684
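/*
 * Informal summary of the stream format consumed above (derived from the
 * reader code, kept here for reference): every record starts with a be64
 * word whose low bits carry RAM_SAVE_FLAG_* values and whose remaining bits
 * carry the page address (or the total RAM size for MEM_SIZE records).
 * Page records identify their RAMBlock as described in
 * ram_block_from_stream(); the payload then depends on the flag:
 *   RAM_SAVE_FLAG_ZERO:          one fill byte (expected to be zero)
 *   RAM_SAVE_FLAG_PAGE:          TARGET_PAGE_SIZE bytes of raw data
 *   RAM_SAVE_FLAG_COMPRESS_PAGE: be32 length plus that many zlib bytes
 *   RAM_SAVE_FLAG_XBZRLE:        flag byte, be16 length, encoded data
 * RAM_SAVE_FLAG_EOS carries no payload and ends each section; the "ram"
 * section itself is registered with version 4 by ram_mig_init(), matching
 * the version_id check in ram_load().
 */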