/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"

#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

static int dirty_rate_high_cnt;

static uint64_t bitmap_sync_count;

/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
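
/*
 * Example of how these flags travel on the wire: each page is announced
 * by a be64 value that combines the page-aligned offset within its
 * RAMBlock with the RAM_SAVE_FLAG_* bits in the (otherwise zero) low
 * bits.  A zero page at offset 0x3000, for instance, is announced as
 * 0x3000 | RAM_SAVE_FLAG_COMPRESS; see save_page_header() and
 * save_zero_page() below.
 */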

static uint8_t *ZERO_TARGET_PAGE;

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/*
 * Called from qmp_migrate_set_cache_size in main thread, possibly while
 * a migration is in progress.
 * A running migration may be using the cache and might finish during this
 * call, hence changes to the cache are protected by XBZRLE.lock.
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}
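
/*
 * Note that the value reported back to the caller is always a power of
 * two: a request that is not a power of two is rounded down, so asking
 * for 5 MiB, for example, returns 4 MiB (pow2floor above).
 */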

/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;

static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}

uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}

/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static QemuMutex migration_bitmap_mutex;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current offset to search from */
    ram_addr_t   offset;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}
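
/*
 * A sketch of the handshake between the migration thread and the worker
 * threads created below: the migration thread picks a worker whose 'done'
 * flag is set, fills in block/offset under the worker's mutex and signals
 * its condition variable; the worker (do_data_compress above) compresses
 * the page into its private QEMUFile buffer, sets 'done' and signals
 * comp_done_cond; flush_compressed_data() and
 * compress_page_with_multi_thread() then copy that buffer into the real
 * migration stream with qemu_put_qemu_file().  Setting 'quit' makes a
 * worker exit its loop.
 */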

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 * If guest dirty memory rate is reduced below the rate at which we can
 * transfer pages to the destination then we should be able to complete
 * migration. Some workloads dirty memory way too fast and will not effectively
 * converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}
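
/*
 * For example, with cpu_throttle_initial set to 20 and
 * cpu_throttle_increment set to 10, the first call above throttles the
 * vCPUs to 20% and each further call raises that to 30%, 40% and so on,
 * until the dirty rate drops low enough for migration to converge.
 */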

/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    if (ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 bitmap_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @current_data: pointer to the address of the page contents; may be
 *                updated to point at the cached copy of the page
 * @current_addr: address of the page used as the cache key
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
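
/*
 * Wire format of the page sent above, for reference: the usual page
 * header with RAM_SAVE_FLAG_XBZRLE set, then one byte holding
 * ENCODING_FLAG_XBZRLE, a be16 with the encoded length, and finally
 * encoded_len bytes of XBZRLE-encoded data (hence the "+ 1 + 2" in the
 * byte accounting).
 */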

/* Called with rcu_read_lock() to protect migration_bitmap
 * rb: The RAMBlock to search for dirty pages in
 * start: Start address (typically so we can continue from previous page)
 * ram_addr_abs: Pointer into which to store the address of the dirty page
 *               within the global ram_addr space
 *
 * Returns: byte offset within memory region of the start of a dirty page
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}

static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
{
    bool ret;
    int nr = addr >> TARGET_PAGE_BITS;
    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;

    ret = test_and_clear_bit(nr, bitmap);

    if (ret) {
        migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}

/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;

static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}

static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&migration_bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&migration_bitmap_mutex);

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */
            bytes_xfer_now = ram_bytes_transferred();

            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
                (dirty_rate_high_cnt++ >= 2)) {
                trace_migration_throttle();
                dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
            bytes_xfer_prev = bytes_xfer_now;
        }

        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                acct_info.xbzrle_cache_miss_rate =
                    (double)(acct_info.xbzrle_cache_miss -
                             xbzrle_cache_miss_prev) /
                    (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
    }
}

/**
 * save_zero_page: Send the zero page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p, uint64_t *bytes_transferred)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        acct_info.dup_pages++;
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        *bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}
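
/*
 * On the wire a zero page therefore costs just the page header (with
 * RAM_SAVE_FLAG_COMPRESS set) plus a single zero byte; the destination
 * recreates the whole TARGET_PAGE_SIZE page from that byte.
 */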

/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage &&
                   !migration_in_postcopy(migrate_get_current()) &&
                   migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
    }

    return bytes_sent;
}

static uint64_t bytes_transferred;

static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
                                    bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in the last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    *bytes_transferred += bytes_xmit + blen;
                    acct_info.norm_pages++;
                    pages = 1;
                } else {
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
        } else {
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}

/*
 * Find the next dirty page and update any state associated with
 * the search process.
 *
 * Returns: True if a page is found
 *
 * @f: Current migration stream.
 * @pss: Data about the state of the current dirty page scan.
 * @*again: Set to false if the search has scanned the whole of RAM
 * *ram_addr_abs: Pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
                             bool *again, ram_addr_t *ram_addr_abs)
{
    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
                                              ram_addr_abs);
    if (pss->complete_round && pss->block == last_seen_block &&
        pss->offset >= last_offset) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (pss->offset >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->offset = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(f);
                compression_switch = false;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/*
 * Helper for 'get_queued_page' - gets a page off the queue
 *      ms:      MigrationState in
 *      *offset: Used to return the offset within the RAMBlock
 *      ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns:      block (or NULL if none available)
 */
static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
                              ram_addr_t *ram_addr_abs)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&ms->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
        struct MigrationSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&ms->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
        *ram_addr_abs = (entry->offset + entry->rb->offset) &
                        TARGET_PAGE_MASK;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&ms->src_page_req_mutex);

    return block;
}

/*
 * Unqueue a page from the queue fed by postcopy page requests; skips pages
 * that are already sent (!dirty)
 *
 *      ms:      MigrationState in
 *      pss:     PageSearchStatus structure updated with found block/offset
 *      ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns:      true if a queued page is found
 */
static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
                            ram_addr_t *ram_addr_abs)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(ms, &offset, ram_addr_abs);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long *bitmap;
            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(
                    block->idstr, (uint64_t)offset,
                    (uint64_t)*ram_addr_abs,
                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
            } else {
                trace_get_queued_page(block->idstr,
                                      (uint64_t)offset,
                                      (uint64_t)*ram_addr_abs);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in migration_bitmap_find_dirty that every page is dirty, and
         * that's no longer true.
         */
        ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->offset = offset;
    }

    return !!block;
}

/**
 * flush_page_queue: Flush any remaining pages in the ram request queue
 *    it should be empty at the end anyway, but in error cases there may be
 *    some left.
 *
 * ms: MigrationState
 */
void flush_page_queue(MigrationState *ms)
{
    struct MigrationSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * Queue the pages for transmission, e.g. a request from postcopy destination
 *   ms: MigrationState in which the queue is held
 *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
 *   start: Offset from the start of the RAMBlock
 *   len: Length (in bytes) to send
 *   Return: 0 on success
 */
int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                         ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;

    ms->postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = ms->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        ms->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct MigrationSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct MigrationSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&ms->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&ms->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}
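
/*
 * For example, a postcopy destination that faults on a page of a
 * hypothetical block "pc.ram" at offset 0x200000 would cause the source
 * to call ram_save_queue_pages(ms, "pc.ram", 0x200000, TARGET_PAGE_SIZE);
 * the queued entry is then consumed one target page at a time by
 * unqueue_page().
 */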

/**
 * ram_save_target_page: Save one target page
 *
 * @ms: The current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 *
 * Returns: Number of pages written.
 */
static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
                                PageSearchStatus *pss,
                                bool last_stage,
                                uint64_t *bytes_transferred,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check if the page is dirty and if it is send it */
    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(f, pss,
                                           last_stage,
                                           bytes_transferred);
        } else {
            res = ram_save_page(f, pss, last_stage,
                                bytes_transferred);
        }

        if (res < 0) {
            return res;
        }
        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            last_sent_block = pss->block;
        }
    }

    return res;
}

/**
 * ram_save_host_page: Starting at *offset send pages up to the end
 *                     of the current host page. It's valid for the initial
 *                     offset to point into the middle of a host page
 *                     in which case the remainder of the hostpage is sent.
 *                     Only dirty target pages are sent.
 *
 * Returns: Number of pages written.
 *
 * @ms: The current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send; pss->offset is updated to the
 *       last target page sent
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
                              PageSearchStatus *pss,
                              bool last_stage,
                              uint64_t *bytes_transferred,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    do {
        tmppages = ram_save_target_page(ms, f, pss, last_stage,
                                        bytes_transferred, dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (pss->offset & (qemu_host_page_size - 1));

    /* The offset we leave with is the last one we looked at */
    pss->offset -= TARGET_PAGE_SIZE;
    return pages;
}
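
/*
 * For example, on a host with 64KiB pages and 4KiB target pages, a scan
 * hit anywhere inside a host page makes the loop above walk the remaining
 * (up to 16) target pages of that host page, sending whichever of them
 * are still dirty.
 */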

/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns:  The number of pages written
 *           0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    PageSearchStatus pss;
    MigrationState *ms = migrate_get_current();
    int pages = 0;
    bool again, found;
    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                 ram_addr_t space */

    pss.block = last_seen_block;
    pss.offset = last_offset;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(ms, &pss, &dirty_ram_abs);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
        }

        if (found) {
            pages = ram_save_host_page(ms, f, &pss,
                                       last_stage, bytes_transferred,
                                       dirty_ram_abs);
        }
    } while (!pages && again);

    last_seen_block = pss.block;
    last_offset = pss.offset;

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    if (zero) {
        acct_info.dup_pages += pages;
    } else {
        acct_info.norm_pages += pages;
        bytes_transferred += size;
        qemu_update_position(f, size);
    }
}

static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}

uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}

void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}

static void migration_bitmap_free(struct BitmapRcu *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap);
    g_free(bmap);
}

static void ram_migration_cleanup(void *opaque)
{
    /* The caller holds the iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
    struct BitmapRcu *bitmap = migration_bitmap_rcu;
    atomic_rcu_set(&migration_bitmap_rcu, NULL);
    if (bitmap) {
        memory_global_dirty_log_stop();
        call_rcu(bitmap, migration_bitmap_free, rcu);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(ZERO_TARGET_PAGE);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}

static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    /* called in qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (migration_bitmap_rcu) {
        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* prevent bits in migration_bitmap from being set by
         * migration_bitmap_sync_range() at the same time.
         * it is safe for migration if a migration_bitmap bit is cleared
         * at the same time.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

/*
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 * start,length: Indexes into the bitmap for the first bit
 *               representing the named block and length in target-pages
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        unsigned long start,
                                        unsigned long length)
{
    unsigned long end = start + length; /* one after the end */
    unsigned long current;
    unsigned long *unsentmap;

    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
    for (current = start; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/*
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * Returns: 0 on success
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
                                                               first,
                                                               block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, first,
                                    block->used_length >> TARGET_PAGE_BITS);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 *   the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 *   unsent_pass=true: Cleans up partially unsent host pages by searching
 *                     the unsentmap
 *   unsent_pass=false: Cleans up partially dirty host pages by searching
 *                      the main migration bitmap
 *
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
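
/*
 * A small worked example of the pass above, assuming 4 target pages per
 * host page (e.g. 16KiB host pages with 4KiB target pages): if only
 * target pages 5 and 6 of a block are still dirty, the run starting at
 * page 5 begins mid host page, so the whole host page (pages 4-7) is
 * treated as one unit: it is re-marked dirty/unsent here and, if part of
 * it had already been sent, the destination is told to discard it.
 */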

/*
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.
 *
 * Returns: 0 on success
 */
static int postcopy_chunk_hostpages(MigrationState *ms)
{
    struct RAMBlock *block;

    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS - nothing to be done */
        return 0;
    }

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;

        PostcopyDiscardState *pds =
                         postcopy_discard_send_init(ms, first, block->idstr);

        /* First pass: Discard all partially sent host pages */
        postcopy_chunk_hostpages_pass(ms, true, block, pds);
        /*
         * Second pass: Ensure that all partially dirty host pages are made
         * fully dirty.
         */
        postcopy_chunk_hostpages_pass(ms, false, block, pds);

        postcopy_discard_send_finish(ms, pds);
    } /* ram_list loop */

    return 0;
}

/*
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    int ret;
    unsigned long *bitmap, *unsentmap;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync();

    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
    if (!unsentmap) {
        /* We don't have a safe way to resize the sentmap, so
         * if the bitmap was resized it will be NULL at this
         * point.
         */
        error_report("migration ram resized during precopy phase");
        rcu_read_unlock();
        return -EINVAL;
    }

    /* Deal with TPS != HPS */
    ret = postcopy_chunk_hostpages(ms);
    if (ret) {
        rcu_read_unlock();
        return ret;
    }

    /*
     * Update the unsentmap to be unsentmap = unsentmap | dirty
     */
    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    bitmap_or(unsentmap, unsentmap, bitmap,
              last_ram_offset() >> TARGET_PAGE_BITS);


    trace_ram_postcopy_send_discard_bitmap();
#ifdef DEBUG_POSTCOPY
    ram_debug_dump_bitmap(unsentmap, true);
#endif

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}

/*
 * At the start of the postcopy phase of migration, any now-dirty
 * precopied pages are discarded.
 *
 * start, length describe a byte address range within the RAMBlock
 *
 * Returns 0 on success.
 */
int ram_discard_range(MigrationIncomingState *mis,
                      const char *block_name,
                      uint64_t start, size_t length)
{
    int ret = -1;

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(block_name);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'",
                     block_name);
        goto err;
    }

    uint8_t *host_startaddr = rb->host + start;

    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
        error_report("ram_discard_range: Unaligned start address: %p",
                     host_startaddr);
        goto err;
    }

    if ((start + length) <= rb->used_length) {
        uint8_t *host_endaddr = host_startaddr + length;
        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
            error_report("ram_discard_range: Unaligned end address: %p",
                         host_endaddr);
            goto err;
        }
        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
    } else {
        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
                     "/%zx/" RAM_ADDR_FMT")",
                     block_name, start, length, rb->used_length);
    }

err:
    rcu_read_unlock();

    return ret;
}

static int ram_save_init_globals(void)
{
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();
    qemu_mutex_init(&migration_bitmap_mutex);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);

    if (migrate_postcopy_ram()) {
        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}
When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */

static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;

    /* migration has already set up the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_save_init_globals() < 0) {
            return -1;
        }
    }

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        acct_info.iterations++;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once
           every few iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
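     * The RAM_CONTROL_ROUND hook below therefore runs before the EOS
     * marker is written.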
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    if (!migration_in_postcopy(migrate_get_current())) {
        migration_bitmap_sync();
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
                                        &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy(migrate_get_current()) &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
/*
 * Read a RAMBlock ID from the stream f.
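 * (on the wire this is a single length byte followed by the ID string;
 * when RAM_SAVE_FLAG_CONTINUE is set nothing is read and the previously
 * returned block is reused)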
 *
 * f: Stream to read from
 * flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
                                              int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially
             * when the page was dirtied while it was being compressed;
             * that's not a problem because the dirty page will be
             * retransferred and uncompress() won't break the data in
             * other pages.
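             * The return value of uncompress() is therefore deliberately
             * ignored below.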
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/*
 * Allocate data structures etc. needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named postcopy_ram_incoming_init
 * does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    return
postcopy_ram_incoming_init(mis, ram_pages); 2341 } 2342 2343 /* 2344 * Called in postcopy mode by ram_load(). 2345 * rcu_read_lock is taken prior to this being called. 2346 */ 2347 static int ram_load_postcopy(QEMUFile *f) 2348 { 2349 int flags = 0, ret = 0; 2350 bool place_needed = false; 2351 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE; 2352 MigrationIncomingState *mis = migration_incoming_get_current(); 2353 /* Temporary page that is later 'placed' */ 2354 void *postcopy_host_page = postcopy_get_tmp_page(mis); 2355 void *last_host = NULL; 2356 bool all_zero = false; 2357 2358 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 2359 ram_addr_t addr; 2360 void *host = NULL; 2361 void *page_buffer = NULL; 2362 void *place_source = NULL; 2363 uint8_t ch; 2364 2365 addr = qemu_get_be64(f); 2366 flags = addr & ~TARGET_PAGE_MASK; 2367 addr &= TARGET_PAGE_MASK; 2368 2369 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 2370 place_needed = false; 2371 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) { 2372 RAMBlock *block = ram_block_from_stream(f, flags); 2373 2374 host = host_from_ram_block_offset(block, addr); 2375 if (!host) { 2376 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 2377 ret = -EINVAL; 2378 break; 2379 } 2380 /* 2381 * Postcopy requires that we place whole host pages atomically. 2382 * To make it atomic, the data is read into a temporary page 2383 * that's moved into place later. 2384 * The migration protocol uses, possibly smaller, target-pages 2385 * however the source ensures it always sends all the components 2386 * of a host page in order. 2387 */ 2388 page_buffer = postcopy_host_page + 2389 ((uintptr_t)host & ~qemu_host_page_mask); 2390 /* If all TP are zero then we can optimise the place */ 2391 if (!((uintptr_t)host & ~qemu_host_page_mask)) { 2392 all_zero = true; 2393 } else { 2394 /* not the 1st TP within the HP */ 2395 if (host != (last_host + TARGET_PAGE_SIZE)) { 2396 error_report("Non-sequential target page %p/%p", 2397 host, last_host); 2398 ret = -EINVAL; 2399 break; 2400 } 2401 } 2402 2403 2404 /* 2405 * If it's the last part of a host page then we place the host 2406 * page 2407 */ 2408 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & 2409 ~qemu_host_page_mask) == 0; 2410 place_source = postcopy_host_page; 2411 } 2412 last_host = host; 2413 2414 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2415 case RAM_SAVE_FLAG_COMPRESS: 2416 ch = qemu_get_byte(f); 2417 memset(page_buffer, ch, TARGET_PAGE_SIZE); 2418 if (ch) { 2419 all_zero = false; 2420 } 2421 break; 2422 2423 case RAM_SAVE_FLAG_PAGE: 2424 all_zero = false; 2425 if (!place_needed || !matching_page_sizes) { 2426 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 2427 } else { 2428 /* Avoids the qemu_file copy during postcopy, which is 2429 * going to do a copy later; can only do it when we 2430 * do this read in one go (matching page sizes) 2431 */ 2432 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 2433 TARGET_PAGE_SIZE); 2434 } 2435 break; 2436 case RAM_SAVE_FLAG_EOS: 2437 /* normal exit */ 2438 break; 2439 default: 2440 error_report("Unknown combination of migration flags: %#x" 2441 " (postcopy mode)", flags); 2442 ret = -EINVAL; 2443 } 2444 2445 if (place_needed) { 2446 /* This gets called at the last target page in the host page */ 2447 if (all_zero) { 2448 ret = postcopy_place_page_zero(mis, 2449 host + TARGET_PAGE_SIZE - 2450 qemu_host_page_size); 2451 } else { 2452 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE - 2453 qemu_host_page_size, 2454 
place_source); 2455 } 2456 } 2457 if (!ret) { 2458 ret = qemu_file_get_error(f); 2459 } 2460 } 2461 2462 return ret; 2463 } 2464 2465 static int ram_load(QEMUFile *f, void *opaque, int version_id) 2466 { 2467 int flags = 0, ret = 0; 2468 static uint64_t seq_iter; 2469 int len = 0; 2470 /* 2471 * If system is running in postcopy mode, page inserts to host memory must 2472 * be atomic 2473 */ 2474 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; 2475 2476 seq_iter++; 2477 2478 if (version_id != 4) { 2479 ret = -EINVAL; 2480 } 2481 2482 /* This RCU critical section can be very long running. 2483 * When RCU reclaims in the code start to become numerous, 2484 * it will be necessary to reduce the granularity of this 2485 * critical section. 2486 */ 2487 rcu_read_lock(); 2488 2489 if (postcopy_running) { 2490 ret = ram_load_postcopy(f); 2491 } 2492 2493 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { 2494 ram_addr_t addr, total_ram_bytes; 2495 void *host = NULL; 2496 uint8_t ch; 2497 2498 addr = qemu_get_be64(f); 2499 flags = addr & ~TARGET_PAGE_MASK; 2500 addr &= TARGET_PAGE_MASK; 2501 2502 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE | 2503 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 2504 RAMBlock *block = ram_block_from_stream(f, flags); 2505 2506 host = host_from_ram_block_offset(block, addr); 2507 if (!host) { 2508 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 2509 ret = -EINVAL; 2510 break; 2511 } 2512 } 2513 2514 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2515 case RAM_SAVE_FLAG_MEM_SIZE: 2516 /* Synchronize RAM block list */ 2517 total_ram_bytes = addr; 2518 while (!ret && total_ram_bytes) { 2519 RAMBlock *block; 2520 char id[256]; 2521 ram_addr_t length; 2522 2523 len = qemu_get_byte(f); 2524 qemu_get_buffer(f, (uint8_t *)id, len); 2525 id[len] = 0; 2526 length = qemu_get_be64(f); 2527 2528 block = qemu_ram_block_by_name(id); 2529 if (block) { 2530 if (length != block->used_length) { 2531 Error *local_err = NULL; 2532 2533 ret = qemu_ram_resize(block, length, 2534 &local_err); 2535 if (local_err) { 2536 error_report_err(local_err); 2537 } 2538 } 2539 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 2540 block->idstr); 2541 } else { 2542 error_report("Unknown ramblock \"%s\", cannot " 2543 "accept migration", id); 2544 ret = -EINVAL; 2545 } 2546 2547 total_ram_bytes -= length; 2548 } 2549 break; 2550 2551 case RAM_SAVE_FLAG_COMPRESS: 2552 ch = qemu_get_byte(f); 2553 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 2554 break; 2555 2556 case RAM_SAVE_FLAG_PAGE: 2557 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 2558 break; 2559 2560 case RAM_SAVE_FLAG_COMPRESS_PAGE: 2561 len = qemu_get_be32(f); 2562 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 2563 error_report("Invalid compressed data length: %d", len); 2564 ret = -EINVAL; 2565 break; 2566 } 2567 decompress_data_with_multi_threads(f, host, len); 2568 break; 2569 2570 case RAM_SAVE_FLAG_XBZRLE: 2571 if (load_xbzrle(f, addr, host) < 0) { 2572 error_report("Failed to decompress XBZRLE page at " 2573 RAM_ADDR_FMT, addr); 2574 ret = -EINVAL; 2575 break; 2576 } 2577 break; 2578 case RAM_SAVE_FLAG_EOS: 2579 /* normal exit */ 2580 break; 2581 default: 2582 if (flags & RAM_SAVE_FLAG_HOOK) { 2583 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 2584 } else { 2585 error_report("Unknown combination of migration flags: %#x", 2586 flags); 2587 ret = -EINVAL; 2588 } 2589 } 2590 if (!ret) { 2591 ret = qemu_file_get_error(f); 2592 } 2593 } 2594 2595 
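    /* Make sure every in-flight decompression has completed before we
     * drop the RCU read lock and report the result. */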
wait_for_decompress_done(); 2596 rcu_read_unlock(); 2597 DPRINTF("Completed load of VM with exit code %d seq iteration " 2598 "%" PRIu64 "\n", ret, seq_iter); 2599 return ret; 2600 } 2601 2602 static SaveVMHandlers savevm_ram_handlers = { 2603 .save_live_setup = ram_save_setup, 2604 .save_live_iterate = ram_save_iterate, 2605 .save_live_complete_postcopy = ram_save_complete, 2606 .save_live_complete_precopy = ram_save_complete, 2607 .save_live_pending = ram_save_pending, 2608 .load_state = ram_load, 2609 .cleanup = ram_migration_cleanup, 2610 }; 2611 2612 void ram_mig_init(void) 2613 { 2614 qemu_mutex_init(&XBZRLE.lock); 2615 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); 2616 } 2617
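/*
 * Illustrative sketch, not part of this file's build: how the per-page
 * header that ram_save_iterate() writes and ram_load() parses is laid out.
 * Each page record starts with a be64 whose low bits (those cleared by
 * TARGET_PAGE_MASK) carry RAM_SAVE_FLAG_* values and whose remaining bits
 * carry the page offset; RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as
 * the previous page".  The names and the 4K page size below are assumptions
 * made purely for the example; only the bit layout mirrors the code above.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     #define EX_PAGE_SIZE      4096ULL   // assumed target page size
 *     #define EX_PAGE_MASK      (~(EX_PAGE_SIZE - 1))
 *     #define EX_FLAG_PAGE      0x08ULL   // mirrors RAM_SAVE_FLAG_PAGE
 *     #define EX_FLAG_CONTINUE  0x20ULL   // mirrors RAM_SAVE_FLAG_CONTINUE
 *
 *     int main(void)
 *     {
 *         // sender side: pack page offset and flags into one value
 *         uint64_t offset = 0x42000;
 *         uint64_t header = offset | EX_FLAG_PAGE | EX_FLAG_CONTINUE;
 *
 *         // receiver side: split them again, as ram_load() does
 *         uint64_t flags = header & ~EX_PAGE_MASK;
 *         uint64_t addr  = header & EX_PAGE_MASK;
 *
 *         printf("addr=%#llx flags=%#llx\n",
 *                (unsigned long long)addr, (unsigned long long)flags);
 *         return 0;
 *     }
 */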