1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 #include "qemu/osdep.h" 29 #include "qemu-common.h" 30 #include "cpu.h" 31 #include <zlib.h> 32 #include "qapi-event.h" 33 #include "qemu/cutils.h" 34 #include "qemu/bitops.h" 35 #include "qemu/bitmap.h" 36 #include "qemu/timer.h" 37 #include "qemu/main-loop.h" 38 #include "migration/migration.h" 39 #include "migration/postcopy-ram.h" 40 #include "exec/address-spaces.h" 41 #include "migration/page_cache.h" 42 #include "qemu/error-report.h" 43 #include "trace.h" 44 #include "exec/ram_addr.h" 45 #include "qemu/rcu_queue.h" 46 #include "migration/colo.h" 47 48 static int dirty_rate_high_cnt; 49 50 static uint64_t bitmap_sync_count; 51 52 /***********************************************************/ 53 /* ram save/restore */ 54 55 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 56 #define RAM_SAVE_FLAG_COMPRESS 0x02 57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 58 #define RAM_SAVE_FLAG_PAGE 0x08 59 #define RAM_SAVE_FLAG_EOS 0x10 60 #define RAM_SAVE_FLAG_CONTINUE 0x20 61 #define RAM_SAVE_FLAG_XBZRLE 0x40 62 /* 0x80 is reserved in migration.h start with 0x100 next */ 63 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 64 65 static uint8_t *ZERO_TARGET_PAGE; 66 67 static inline bool is_zero_range(uint8_t *p, uint64_t size) 68 { 69 return buffer_is_zero(p, size); 70 } 71 72 /* struct contains XBZRLE cache and a static page 73 used by the compression */ 74 static struct { 75 /* buffer used for XBZRLE encoding */ 76 uint8_t *encoded_buf; 77 /* buffer for storing page content */ 78 uint8_t *current_buf; 79 /* Cache for XBZRLE, Protected by lock. */ 80 PageCache *cache; 81 QemuMutex lock; 82 } XBZRLE; 83 84 /* buffer used for XBZRLE decoding */ 85 static uint8_t *xbzrle_decoded_buf; 86 87 static void XBZRLE_cache_lock(void) 88 { 89 if (migrate_use_xbzrle()) 90 qemu_mutex_lock(&XBZRLE.lock); 91 } 92 93 static void XBZRLE_cache_unlock(void) 94 { 95 if (migrate_use_xbzrle()) 96 qemu_mutex_unlock(&XBZRLE.lock); 97 } 98 99 /* 100 * called from qmp_migrate_set_cache_size in main thread, possibly while 101 * a migration is in progress. 102 * A running migration maybe using the cache and might finish during this 103 * call, hence changes to the cache are protected by XBZRLE.lock(). 
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;

static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}

uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}

/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static QemuMutex migration_bitmap_mutex;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current offset to search from */
    ram_addr_t offset;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 * If guest dirty memory rate is reduced below the rate at which we can
 * transfer pages to the destination then we should be able to complete
 * migration. Some workloads dirty memory way too fast and will not effectively
 * converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}

/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
437 * As a bonus, if the page wasn't in the cache it gets added so that 438 * when a small write is made into the 0'd page it gets XBZRLE sent 439 */ 440 static void xbzrle_cache_zero_page(ram_addr_t current_addr) 441 { 442 if (ram_bulk_stage || !migrate_use_xbzrle()) { 443 return; 444 } 445 446 /* We don't care if this fails to allocate a new cache page 447 * as long as it updated an old one */ 448 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, 449 bitmap_sync_count); 450 } 451 452 #define ENCODING_FLAG_XBZRLE 0x1 453 454 /** 455 * save_xbzrle_page: compress and send current page 456 * 457 * Returns: 1 means that we wrote the page 458 * 0 means that page is identical to the one already sent 459 * -1 means that xbzrle would be longer than normal 460 * 461 * @f: QEMUFile where to send the data 462 * @current_data: 463 * @current_addr: 464 * @block: block that contains the page we want to send 465 * @offset: offset inside the block for the page 466 * @last_stage: if we are at the completion stage 467 * @bytes_transferred: increase it with the number of transferred bytes 468 */ 469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, 470 ram_addr_t current_addr, RAMBlock *block, 471 ram_addr_t offset, bool last_stage, 472 uint64_t *bytes_transferred) 473 { 474 int encoded_len = 0, bytes_xbzrle; 475 uint8_t *prev_cached_page; 476 477 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) { 478 acct_info.xbzrle_cache_miss++; 479 if (!last_stage) { 480 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 481 bitmap_sync_count) == -1) { 482 return -1; 483 } else { 484 /* update *current_data when the page has been 485 inserted into cache */ 486 *current_data = get_cached_data(XBZRLE.cache, current_addr); 487 } 488 } 489 return -1; 490 } 491 492 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 493 494 /* save current buffer into memory */ 495 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 496 497 /* XBZRLE encoding (if there is no overflow) */ 498 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 499 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 500 TARGET_PAGE_SIZE); 501 if (encoded_len == 0) { 502 trace_save_xbzrle_page_skipping(); 503 return 0; 504 } else if (encoded_len == -1) { 505 trace_save_xbzrle_page_overflow(); 506 acct_info.xbzrle_overflows++; 507 /* update data in the cache */ 508 if (!last_stage) { 509 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); 510 *current_data = prev_cached_page; 511 } 512 return -1; 513 } 514 515 /* we need to update the data in the cache, in order to get the same data */ 516 if (!last_stage) { 517 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 518 } 519 520 /* Send XBZRLE based compressed page */ 521 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE); 522 qemu_put_byte(f, ENCODING_FLAG_XBZRLE); 523 qemu_put_be16(f, encoded_len); 524 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); 525 bytes_xbzrle += encoded_len + 1 + 2; 526 acct_info.xbzrle_pages++; 527 acct_info.xbzrle_bytes += bytes_xbzrle; 528 *bytes_transferred += bytes_xbzrle; 529 530 return 1; 531 } 532 533 /* Called with rcu_read_lock() to protect migration_bitmap 534 * rb: The RAMBlock to search for dirty pages in 535 * start: Start address (typically so we can continue from previous page) 536 * ram_addr_abs: Pointer into which to store the address of the dirty page 537 * within the global ram_addr space 538 * 539 * Returns: byte offset within memory region of 
the start of a dirty page
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}

static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
{
    bool ret;
    int nr = addr >> TARGET_PAGE_BITS;
    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;

    ret = test_and_clear_bit(nr, bitmap);

    if (ret) {
        migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}

/* Fix me: there are too many global variables used in the migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;

static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}

static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&migration_bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&migration_bitmap_mutex);

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes are 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine.
If that happens twice, start or increase 644 throttling */ 645 bytes_xfer_now = ram_bytes_transferred(); 646 647 if (s->dirty_pages_rate && 648 (num_dirty_pages_period * TARGET_PAGE_SIZE > 649 (bytes_xfer_now - bytes_xfer_prev)/2) && 650 (dirty_rate_high_cnt++ >= 2)) { 651 trace_migration_throttle(); 652 dirty_rate_high_cnt = 0; 653 mig_throttle_guest_down(); 654 } 655 bytes_xfer_prev = bytes_xfer_now; 656 } 657 658 if (migrate_use_xbzrle()) { 659 if (iterations_prev != acct_info.iterations) { 660 acct_info.xbzrle_cache_miss_rate = 661 (double)(acct_info.xbzrle_cache_miss - 662 xbzrle_cache_miss_prev) / 663 (acct_info.iterations - iterations_prev); 664 } 665 iterations_prev = acct_info.iterations; 666 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss; 667 } 668 s->dirty_pages_rate = num_dirty_pages_period * 1000 669 / (end_time - start_time); 670 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE; 671 start_time = end_time; 672 num_dirty_pages_period = 0; 673 } 674 s->dirty_sync_count = bitmap_sync_count; 675 if (migrate_use_events()) { 676 qapi_event_send_migration_pass(bitmap_sync_count, NULL); 677 } 678 } 679 680 /** 681 * save_zero_page: Send the zero page to the stream 682 * 683 * Returns: Number of pages written. 684 * 685 * @f: QEMUFile where to send the data 686 * @block: block that contains the page we want to send 687 * @offset: offset inside the block for the page 688 * @p: pointer to the page 689 * @bytes_transferred: increase it with the number of transferred bytes 690 */ 691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, 692 uint8_t *p, uint64_t *bytes_transferred) 693 { 694 int pages = -1; 695 696 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 697 acct_info.dup_pages++; 698 *bytes_transferred += save_page_header(f, block, 699 offset | RAM_SAVE_FLAG_COMPRESS); 700 qemu_put_byte(f, 0); 701 *bytes_transferred += 1; 702 pages = 1; 703 } 704 705 return pages; 706 } 707 708 static void ram_release_pages(MigrationState *ms, const char *block_name, 709 uint64_t offset, int pages) 710 { 711 if (!migrate_release_ram() || !migration_in_postcopy(ms)) { 712 return; 713 } 714 715 ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS); 716 } 717 718 /** 719 * ram_save_page: Send the given page to the stream 720 * 721 * Returns: Number of pages written. 722 * < 0 - error 723 * >=0 - Number of pages written - this might legally be 0 724 * if xbzrle noticed the page was the same. 725 * 726 * @ms: The current migration state. 
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
            ram_release_pages(ms, block->idstr, pss->offset, pages);
        } else if (!ram_bulk_stage &&
                   !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy(ms));
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(migrate_get_current(), block->idstr,
                          offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static uint64_t bytes_transferred;

static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
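            /* Wait for the worker to finish this slot; do_data_compress()
             * sets ->done and signals comp_done_cond under comp_done_lock. */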
qemu_cond_wait(&comp_done_cond, &comp_done_lock); 851 } 852 } 853 qemu_mutex_unlock(&comp_done_lock); 854 855 for (idx = 0; idx < thread_count; idx++) { 856 qemu_mutex_lock(&comp_param[idx].mutex); 857 if (!comp_param[idx].quit) { 858 len = qemu_put_qemu_file(f, comp_param[idx].file); 859 bytes_transferred += len; 860 } 861 qemu_mutex_unlock(&comp_param[idx].mutex); 862 } 863 } 864 865 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 866 ram_addr_t offset) 867 { 868 param->block = block; 869 param->offset = offset; 870 } 871 872 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, 873 ram_addr_t offset, 874 uint64_t *bytes_transferred) 875 { 876 int idx, thread_count, bytes_xmit = -1, pages = -1; 877 878 thread_count = migrate_compress_threads(); 879 qemu_mutex_lock(&comp_done_lock); 880 while (true) { 881 for (idx = 0; idx < thread_count; idx++) { 882 if (comp_param[idx].done) { 883 comp_param[idx].done = false; 884 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file); 885 qemu_mutex_lock(&comp_param[idx].mutex); 886 set_compress_params(&comp_param[idx], block, offset); 887 qemu_cond_signal(&comp_param[idx].cond); 888 qemu_mutex_unlock(&comp_param[idx].mutex); 889 pages = 1; 890 acct_info.norm_pages++; 891 *bytes_transferred += bytes_xmit; 892 break; 893 } 894 } 895 if (pages > 0) { 896 break; 897 } else { 898 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 899 } 900 } 901 qemu_mutex_unlock(&comp_done_lock); 902 903 return pages; 904 } 905 906 /** 907 * ram_save_compressed_page: compress the given page and send it to the stream 908 * 909 * Returns: Number of pages written. 910 * 911 * @ms: The current migration state. 912 * @f: QEMUFile where to send the data 913 * @block: block that contains the page we want to send 914 * @offset: offset inside the block for the page 915 * @last_stage: if we are at the completion stage 916 * @bytes_transferred: increase it with the number of transferred bytes 917 */ 918 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f, 919 PageSearchStatus *pss, bool last_stage, 920 uint64_t *bytes_transferred) 921 { 922 int pages = -1; 923 uint64_t bytes_xmit = 0; 924 uint8_t *p; 925 int ret, blen; 926 RAMBlock *block = pss->block; 927 ram_addr_t offset = pss->offset; 928 929 p = block->host + offset; 930 931 ret = ram_control_save_page(f, block->offset, 932 offset, TARGET_PAGE_SIZE, &bytes_xmit); 933 if (bytes_xmit) { 934 *bytes_transferred += bytes_xmit; 935 pages = 1; 936 } 937 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 938 if (ret != RAM_SAVE_CONTROL_DELAYED) { 939 if (bytes_xmit > 0) { 940 acct_info.norm_pages++; 941 } else if (bytes_xmit == 0) { 942 acct_info.dup_pages++; 943 } 944 } 945 } else { 946 /* When starting the process of a new block, the first page of 947 * the block should be sent out before other pages in the same 948 * block, and all the pages in last block should have been sent 949 * out, keeping this order is important, because the 'cont' flag 950 * is used to avoid resending the block name. 
951 */ 952 if (block != last_sent_block) { 953 flush_compressed_data(f); 954 pages = save_zero_page(f, block, offset, p, bytes_transferred); 955 if (pages == -1) { 956 /* Make sure the first page is sent out before other pages */ 957 bytes_xmit = save_page_header(f, block, offset | 958 RAM_SAVE_FLAG_COMPRESS_PAGE); 959 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, 960 migrate_compress_level()); 961 if (blen > 0) { 962 *bytes_transferred += bytes_xmit + blen; 963 acct_info.norm_pages++; 964 pages = 1; 965 } else { 966 qemu_file_set_error(f, blen); 967 error_report("compressed data failed!"); 968 } 969 } 970 if (pages > 0) { 971 ram_release_pages(ms, block->idstr, pss->offset, pages); 972 } 973 } else { 974 offset |= RAM_SAVE_FLAG_CONTINUE; 975 pages = save_zero_page(f, block, offset, p, bytes_transferred); 976 if (pages == -1) { 977 pages = compress_page_with_multi_thread(f, block, offset, 978 bytes_transferred); 979 } else { 980 ram_release_pages(ms, block->idstr, pss->offset, pages); 981 } 982 } 983 } 984 985 return pages; 986 } 987 988 /* 989 * Find the next dirty page and update any state associated with 990 * the search process. 991 * 992 * Returns: True if a page is found 993 * 994 * @f: Current migration stream. 995 * @pss: Data about the state of the current dirty page scan. 996 * @*again: Set to false if the search has scanned the whole of RAM 997 * *ram_addr_abs: Pointer into which to store the address of the dirty page 998 * within the global ram_addr space 999 */ 1000 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, 1001 bool *again, ram_addr_t *ram_addr_abs) 1002 { 1003 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, 1004 ram_addr_abs); 1005 if (pss->complete_round && pss->block == last_seen_block && 1006 pss->offset >= last_offset) { 1007 /* 1008 * We've been once around the RAM and haven't found anything. 1009 * Give up. 1010 */ 1011 *again = false; 1012 return false; 1013 } 1014 if (pss->offset >= pss->block->used_length) { 1015 /* Didn't find anything in this RAM Block */ 1016 pss->offset = 0; 1017 pss->block = QLIST_NEXT_RCU(pss->block, next); 1018 if (!pss->block) { 1019 /* Hit the end of the list */ 1020 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1021 /* Flag that we've looped */ 1022 pss->complete_round = true; 1023 ram_bulk_stage = false; 1024 if (migrate_use_xbzrle()) { 1025 /* If xbzrle is on, stop using the data compression at this 1026 * point. In theory, xbzrle can do better than compression. 1027 */ 1028 flush_compressed_data(f); 1029 compression_switch = false; 1030 } 1031 } 1032 /* Didn't find anything this time, but try again on the new block */ 1033 *again = true; 1034 return false; 1035 } else { 1036 /* Can go around again, but... 
*/ 1037 *again = true; 1038 /* We've found something so probably don't need to */ 1039 return true; 1040 } 1041 } 1042 1043 /* 1044 * Helper for 'get_queued_page' - gets a page off the queue 1045 * ms: MigrationState in 1046 * *offset: Used to return the offset within the RAMBlock 1047 * ram_addr_abs: global offset in the dirty/sent bitmaps 1048 * 1049 * Returns: block (or NULL if none available) 1050 */ 1051 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, 1052 ram_addr_t *ram_addr_abs) 1053 { 1054 RAMBlock *block = NULL; 1055 1056 qemu_mutex_lock(&ms->src_page_req_mutex); 1057 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { 1058 struct MigrationSrcPageRequest *entry = 1059 QSIMPLEQ_FIRST(&ms->src_page_requests); 1060 block = entry->rb; 1061 *offset = entry->offset; 1062 *ram_addr_abs = (entry->offset + entry->rb->offset) & 1063 TARGET_PAGE_MASK; 1064 1065 if (entry->len > TARGET_PAGE_SIZE) { 1066 entry->len -= TARGET_PAGE_SIZE; 1067 entry->offset += TARGET_PAGE_SIZE; 1068 } else { 1069 memory_region_unref(block->mr); 1070 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); 1071 g_free(entry); 1072 } 1073 } 1074 qemu_mutex_unlock(&ms->src_page_req_mutex); 1075 1076 return block; 1077 } 1078 1079 /* 1080 * Unqueue a page from the queue fed by postcopy page requests; skips pages 1081 * that are already sent (!dirty) 1082 * 1083 * ms: MigrationState in 1084 * pss: PageSearchStatus structure updated with found block/offset 1085 * ram_addr_abs: global offset in the dirty/sent bitmaps 1086 * 1087 * Returns: true if a queued page is found 1088 */ 1089 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, 1090 ram_addr_t *ram_addr_abs) 1091 { 1092 RAMBlock *block; 1093 ram_addr_t offset; 1094 bool dirty; 1095 1096 do { 1097 block = unqueue_page(ms, &offset, ram_addr_abs); 1098 /* 1099 * We're sending this page, and since it's postcopy nothing else 1100 * will dirty it, and we must make sure it doesn't get sent again 1101 * even if this queue request was received after the background 1102 * search already sent it. 1103 */ 1104 if (block) { 1105 unsigned long *bitmap; 1106 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1107 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); 1108 if (!dirty) { 1109 trace_get_queued_page_not_dirty( 1110 block->idstr, (uint64_t)offset, 1111 (uint64_t)*ram_addr_abs, 1112 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, 1113 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); 1114 } else { 1115 trace_get_queued_page(block->idstr, 1116 (uint64_t)offset, 1117 (uint64_t)*ram_addr_abs); 1118 } 1119 } 1120 1121 } while (block && !dirty); 1122 1123 if (block) { 1124 /* 1125 * As soon as we start servicing pages out of order, then we have 1126 * to kill the bulk stage, since the bulk stage assumes 1127 * in (migration_bitmap_find_and_reset_dirty) that every page is 1128 * dirty, that's no longer true. 1129 */ 1130 ram_bulk_stage = false; 1131 1132 /* 1133 * We want the background search to continue from the queued page 1134 * since the guest is likely to want other pages near to the page 1135 * it just requested. 1136 */ 1137 pss->block = block; 1138 pss->offset = offset; 1139 } 1140 1141 return !!block; 1142 } 1143 1144 /** 1145 * flush_page_queue: Flush any remaining pages in the ram request queue 1146 * it should be empty at the end anyway, but in error cases there may be 1147 * some left. 
1148 * 1149 * ms: MigrationState 1150 */ 1151 void flush_page_queue(MigrationState *ms) 1152 { 1153 struct MigrationSrcPageRequest *mspr, *next_mspr; 1154 /* This queue generally should be empty - but in the case of a failed 1155 * migration might have some droppings in. 1156 */ 1157 rcu_read_lock(); 1158 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) { 1159 memory_region_unref(mspr->rb->mr); 1160 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); 1161 g_free(mspr); 1162 } 1163 rcu_read_unlock(); 1164 } 1165 1166 /** 1167 * Queue the pages for transmission, e.g. a request from postcopy destination 1168 * ms: MigrationStatus in which the queue is held 1169 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last) 1170 * start: Offset from the start of the RAMBlock 1171 * len: Length (in bytes) to send 1172 * Return: 0 on success 1173 */ 1174 int ram_save_queue_pages(MigrationState *ms, const char *rbname, 1175 ram_addr_t start, ram_addr_t len) 1176 { 1177 RAMBlock *ramblock; 1178 1179 ms->postcopy_requests++; 1180 rcu_read_lock(); 1181 if (!rbname) { 1182 /* Reuse last RAMBlock */ 1183 ramblock = ms->last_req_rb; 1184 1185 if (!ramblock) { 1186 /* 1187 * Shouldn't happen, we can't reuse the last RAMBlock if 1188 * it's the 1st request. 1189 */ 1190 error_report("ram_save_queue_pages no previous block"); 1191 goto err; 1192 } 1193 } else { 1194 ramblock = qemu_ram_block_by_name(rbname); 1195 1196 if (!ramblock) { 1197 /* We shouldn't be asked for a non-existent RAMBlock */ 1198 error_report("ram_save_queue_pages no block '%s'", rbname); 1199 goto err; 1200 } 1201 ms->last_req_rb = ramblock; 1202 } 1203 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1204 if (start+len > ramblock->used_length) { 1205 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1206 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1207 __func__, start, len, ramblock->used_length); 1208 goto err; 1209 } 1210 1211 struct MigrationSrcPageRequest *new_entry = 1212 g_malloc0(sizeof(struct MigrationSrcPageRequest)); 1213 new_entry->rb = ramblock; 1214 new_entry->offset = start; 1215 new_entry->len = len; 1216 1217 memory_region_ref(ramblock->mr); 1218 qemu_mutex_lock(&ms->src_page_req_mutex); 1219 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req); 1220 qemu_mutex_unlock(&ms->src_page_req_mutex); 1221 rcu_read_unlock(); 1222 1223 return 0; 1224 1225 err: 1226 rcu_read_unlock(); 1227 return -1; 1228 } 1229 1230 /** 1231 * ram_save_target_page: Save one target page 1232 * 1233 * 1234 * @f: QEMUFile where to send the data 1235 * @block: pointer to block that contains the page we want to send 1236 * @offset: offset inside the block for the page; 1237 * @last_stage: if we are at the completion stage 1238 * @bytes_transferred: increase it with the number of transferred bytes 1239 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space 1240 * 1241 * Returns: Number of pages written. 
1242 */ 1243 static int ram_save_target_page(MigrationState *ms, QEMUFile *f, 1244 PageSearchStatus *pss, 1245 bool last_stage, 1246 uint64_t *bytes_transferred, 1247 ram_addr_t dirty_ram_abs) 1248 { 1249 int res = 0; 1250 1251 /* Check the pages is dirty and if it is send it */ 1252 if (migration_bitmap_clear_dirty(dirty_ram_abs)) { 1253 unsigned long *unsentmap; 1254 if (compression_switch && migrate_use_compression()) { 1255 res = ram_save_compressed_page(ms, f, pss, 1256 last_stage, 1257 bytes_transferred); 1258 } else { 1259 res = ram_save_page(ms, f, pss, last_stage, 1260 bytes_transferred); 1261 } 1262 1263 if (res < 0) { 1264 return res; 1265 } 1266 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1267 if (unsentmap) { 1268 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap); 1269 } 1270 /* Only update last_sent_block if a block was actually sent; xbzrle 1271 * might have decided the page was identical so didn't bother writing 1272 * to the stream. 1273 */ 1274 if (res > 0) { 1275 last_sent_block = pss->block; 1276 } 1277 } 1278 1279 return res; 1280 } 1281 1282 /** 1283 * ram_save_host_page: Starting at *offset send pages up to the end 1284 * of the current host page. It's valid for the initial 1285 * offset to point into the middle of a host page 1286 * in which case the remainder of the hostpage is sent. 1287 * Only dirty target pages are sent. 1288 * 1289 * Returns: Number of pages written. 1290 * 1291 * @f: QEMUFile where to send the data 1292 * @block: pointer to block that contains the page we want to send 1293 * @offset: offset inside the block for the page; updated to last target page 1294 * sent 1295 * @last_stage: if we are at the completion stage 1296 * @bytes_transferred: increase it with the number of transferred bytes 1297 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space 1298 */ 1299 static int ram_save_host_page(MigrationState *ms, QEMUFile *f, 1300 PageSearchStatus *pss, 1301 bool last_stage, 1302 uint64_t *bytes_transferred, 1303 ram_addr_t dirty_ram_abs) 1304 { 1305 int tmppages, pages = 0; 1306 do { 1307 tmppages = ram_save_target_page(ms, f, pss, last_stage, 1308 bytes_transferred, dirty_ram_abs); 1309 if (tmppages < 0) { 1310 return tmppages; 1311 } 1312 1313 pages += tmppages; 1314 pss->offset += TARGET_PAGE_SIZE; 1315 dirty_ram_abs += TARGET_PAGE_SIZE; 1316 } while (pss->offset & (qemu_host_page_size - 1)); 1317 1318 /* The offset we leave with is the last one we looked at */ 1319 pss->offset -= TARGET_PAGE_SIZE; 1320 return pages; 1321 } 1322 1323 /** 1324 * ram_find_and_save_block: Finds a dirty page and sends it to f 1325 * 1326 * Called within an RCU critical section. 1327 * 1328 * Returns: The number of pages written 1329 * 0 means no dirty pages 1330 * 1331 * @f: QEMUFile where to send the data 1332 * @last_stage: if we are at the completion stage 1333 * @bytes_transferred: increase it with the number of transferred bytes 1334 * 1335 * On systems where host-page-size > target-page-size it will send all the 1336 * pages in a host page that are dirty. 
1337 */ 1338 1339 static int ram_find_and_save_block(QEMUFile *f, bool last_stage, 1340 uint64_t *bytes_transferred) 1341 { 1342 PageSearchStatus pss; 1343 MigrationState *ms = migrate_get_current(); 1344 int pages = 0; 1345 bool again, found; 1346 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in 1347 ram_addr_t space */ 1348 1349 /* No dirty page as there is zero RAM */ 1350 if (!ram_bytes_total()) { 1351 return pages; 1352 } 1353 1354 pss.block = last_seen_block; 1355 pss.offset = last_offset; 1356 pss.complete_round = false; 1357 1358 if (!pss.block) { 1359 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1360 } 1361 1362 do { 1363 again = true; 1364 found = get_queued_page(ms, &pss, &dirty_ram_abs); 1365 1366 if (!found) { 1367 /* priority queue empty, so just search for something dirty */ 1368 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); 1369 } 1370 1371 if (found) { 1372 pages = ram_save_host_page(ms, f, &pss, 1373 last_stage, bytes_transferred, 1374 dirty_ram_abs); 1375 } 1376 } while (!pages && again); 1377 1378 last_seen_block = pss.block; 1379 last_offset = pss.offset; 1380 1381 return pages; 1382 } 1383 1384 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1385 { 1386 uint64_t pages = size / TARGET_PAGE_SIZE; 1387 if (zero) { 1388 acct_info.dup_pages += pages; 1389 } else { 1390 acct_info.norm_pages += pages; 1391 bytes_transferred += size; 1392 qemu_update_position(f, size); 1393 } 1394 } 1395 1396 static ram_addr_t ram_save_remaining(void) 1397 { 1398 return migration_dirty_pages; 1399 } 1400 1401 uint64_t ram_bytes_remaining(void) 1402 { 1403 return ram_save_remaining() * TARGET_PAGE_SIZE; 1404 } 1405 1406 uint64_t ram_bytes_transferred(void) 1407 { 1408 return bytes_transferred; 1409 } 1410 1411 uint64_t ram_bytes_total(void) 1412 { 1413 RAMBlock *block; 1414 uint64_t total = 0; 1415 1416 rcu_read_lock(); 1417 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) 1418 total += block->used_length; 1419 rcu_read_unlock(); 1420 return total; 1421 } 1422 1423 void free_xbzrle_decoded_buf(void) 1424 { 1425 g_free(xbzrle_decoded_buf); 1426 xbzrle_decoded_buf = NULL; 1427 } 1428 1429 static void migration_bitmap_free(struct BitmapRcu *bmap) 1430 { 1431 g_free(bmap->bmap); 1432 g_free(bmap->unsentmap); 1433 g_free(bmap); 1434 } 1435 1436 static void ram_migration_cleanup(void *opaque) 1437 { 1438 /* caller have hold iothread lock or is in a bh, so there is 1439 * no writing race against this migration_bitmap 1440 */ 1441 struct BitmapRcu *bitmap = migration_bitmap_rcu; 1442 atomic_rcu_set(&migration_bitmap_rcu, NULL); 1443 if (bitmap) { 1444 memory_global_dirty_log_stop(); 1445 call_rcu(bitmap, migration_bitmap_free, rcu); 1446 } 1447 1448 XBZRLE_cache_lock(); 1449 if (XBZRLE.cache) { 1450 cache_fini(XBZRLE.cache); 1451 g_free(XBZRLE.encoded_buf); 1452 g_free(XBZRLE.current_buf); 1453 g_free(ZERO_TARGET_PAGE); 1454 XBZRLE.cache = NULL; 1455 XBZRLE.encoded_buf = NULL; 1456 XBZRLE.current_buf = NULL; 1457 } 1458 XBZRLE_cache_unlock(); 1459 } 1460 1461 static void reset_ram_globals(void) 1462 { 1463 last_seen_block = NULL; 1464 last_sent_block = NULL; 1465 last_offset = 0; 1466 last_version = ram_list.version; 1467 ram_bulk_stage = true; 1468 } 1469 1470 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1471 1472 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) 1473 { 1474 /* called in qemu main thread, so there is 1475 * no writing race against this migration_bitmap 1476 */ 1477 if (migration_bitmap_rcu) { 1478 struct 
BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* Prevent bits in the migration bitmap from being set by
         * migration_bitmap_sync_range() while we swap the bitmap in;
         * it is safe for migration if bits are only cleared concurrently.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;
    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
        unsigned long run_start = find_next_zero_bit(bitmap, range, first);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/*
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 * with the dirtymap; so a '1' means it's either dirty or unsent.
1568 * start,length: Indexes into the bitmap for the first bit 1569 * representing the named block and length in target-pages 1570 */ 1571 static int postcopy_send_discard_bm_ram(MigrationState *ms, 1572 PostcopyDiscardState *pds, 1573 unsigned long start, 1574 unsigned long length) 1575 { 1576 unsigned long end = start + length; /* one after the end */ 1577 unsigned long current; 1578 unsigned long *unsentmap; 1579 1580 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1581 for (current = start; current < end; ) { 1582 unsigned long one = find_next_bit(unsentmap, end, current); 1583 1584 if (one <= end) { 1585 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); 1586 unsigned long discard_length; 1587 1588 if (zero >= end) { 1589 discard_length = end - one; 1590 } else { 1591 discard_length = zero - one; 1592 } 1593 if (discard_length) { 1594 postcopy_discard_send_range(ms, pds, one, discard_length); 1595 } 1596 current = one + discard_length; 1597 } else { 1598 current = one; 1599 } 1600 } 1601 1602 return 0; 1603 } 1604 1605 /* 1606 * Utility for the outgoing postcopy code. 1607 * Calls postcopy_send_discard_bm_ram for each RAMBlock 1608 * passing it bitmap indexes and name. 1609 * Returns: 0 on success 1610 * (qemu_ram_foreach_block ends up passing unscaled lengths 1611 * which would mean postcopy code would have to deal with target page) 1612 */ 1613 static int postcopy_each_ram_send_discard(MigrationState *ms) 1614 { 1615 struct RAMBlock *block; 1616 int ret; 1617 1618 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1619 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1620 PostcopyDiscardState *pds = postcopy_discard_send_init(ms, 1621 first, 1622 block->idstr); 1623 1624 /* 1625 * Postcopy sends chunks of bitmap over the wire, but it 1626 * just needs indexes at this point, avoids it having 1627 * target page specific code. 1628 */ 1629 ret = postcopy_send_discard_bm_ram(ms, pds, first, 1630 block->used_length >> TARGET_PAGE_BITS); 1631 postcopy_discard_send_finish(ms, pds); 1632 if (ret) { 1633 return ret; 1634 } 1635 } 1636 1637 return 0; 1638 } 1639 1640 /* 1641 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup 1642 * the two bitmaps, that are similar, but one is inverted. 
1643 * 1644 * We search for runs of target-pages that don't start or end on a 1645 * host page boundary; 1646 * unsent_pass=true: Cleans up partially unsent host pages by searching 1647 * the unsentmap 1648 * unsent_pass=false: Cleans up partially dirty host pages by searching 1649 * the main migration bitmap 1650 * 1651 */ 1652 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, 1653 RAMBlock *block, 1654 PostcopyDiscardState *pds) 1655 { 1656 unsigned long *bitmap; 1657 unsigned long *unsentmap; 1658 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE; 1659 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1660 unsigned long len = block->used_length >> TARGET_PAGE_BITS; 1661 unsigned long last = first + (len - 1); 1662 unsigned long run_start; 1663 1664 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1665 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1666 1667 if (unsent_pass) { 1668 /* Find a sent page */ 1669 run_start = find_next_zero_bit(unsentmap, last + 1, first); 1670 } else { 1671 /* Find a dirty page */ 1672 run_start = find_next_bit(bitmap, last + 1, first); 1673 } 1674 1675 while (run_start <= last) { 1676 bool do_fixup = false; 1677 unsigned long fixup_start_addr; 1678 unsigned long host_offset; 1679 1680 /* 1681 * If the start of this run of pages is in the middle of a host 1682 * page, then we need to fixup this host page. 1683 */ 1684 host_offset = run_start % host_ratio; 1685 if (host_offset) { 1686 do_fixup = true; 1687 run_start -= host_offset; 1688 fixup_start_addr = run_start; 1689 /* For the next pass */ 1690 run_start = run_start + host_ratio; 1691 } else { 1692 /* Find the end of this run */ 1693 unsigned long run_end; 1694 if (unsent_pass) { 1695 run_end = find_next_bit(unsentmap, last + 1, run_start + 1); 1696 } else { 1697 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); 1698 } 1699 /* 1700 * If the end isn't at the start of a host page, then the 1701 * run doesn't finish at the end of a host page 1702 * and we need to discard. 1703 */ 1704 host_offset = run_end % host_ratio; 1705 if (host_offset) { 1706 do_fixup = true; 1707 fixup_start_addr = run_end - host_offset; 1708 /* 1709 * This host page has gone, the next loop iteration starts 1710 * from after the fixup 1711 */ 1712 run_start = fixup_start_addr + host_ratio; 1713 } else { 1714 /* 1715 * No discards on this iteration, next loop starts from 1716 * next sent/dirty page 1717 */ 1718 run_start = run_end + 1; 1719 } 1720 } 1721 1722 if (do_fixup) { 1723 unsigned long page; 1724 1725 /* Tell the destination to discard this page */ 1726 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { 1727 /* For the unsent_pass we: 1728 * discard partially sent pages 1729 * For the !unsent_pass (dirty) we: 1730 * discard partially dirty pages that were sent 1731 * (any partially sent pages were already discarded 1732 * by the previous unsent_pass) 1733 */ 1734 postcopy_discard_send_range(ms, pds, fixup_start_addr, 1735 host_ratio); 1736 } 1737 1738 /* Clean up the bitmap */ 1739 for (page = fixup_start_addr; 1740 page < fixup_start_addr + host_ratio; page++) { 1741 /* All pages in this host page are now not sent */ 1742 set_bit(page, unsentmap); 1743 1744 /* 1745 * Remark them as dirty, updating the count for any pages 1746 * that weren't previously dirty. 
1747 */ 1748 migration_dirty_pages += !test_and_set_bit(page, bitmap); 1749 } 1750 } 1751 1752 if (unsent_pass) { 1753 /* Find the next sent page for the next iteration */ 1754 run_start = find_next_zero_bit(unsentmap, last + 1, 1755 run_start); 1756 } else { 1757 /* Find the next dirty page for the next iteration */ 1758 run_start = find_next_bit(bitmap, last + 1, run_start); 1759 } 1760 } 1761 } 1762 1763 /* 1764 * Utility for the outgoing postcopy code. 1765 * 1766 * Discard any partially sent host-page size chunks, mark any partially 1767 * dirty host-page size chunks as all dirty. 1768 * 1769 * Returns: 0 on success 1770 */ 1771 static int postcopy_chunk_hostpages(MigrationState *ms) 1772 { 1773 struct RAMBlock *block; 1774 1775 if (qemu_host_page_size == TARGET_PAGE_SIZE) { 1776 /* Easy case - TPS==HPS - nothing to be done */ 1777 return 0; 1778 } 1779 1780 /* Easiest way to make sure we don't resume in the middle of a host-page */ 1781 last_seen_block = NULL; 1782 last_sent_block = NULL; 1783 last_offset = 0; 1784 1785 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1786 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1787 1788 PostcopyDiscardState *pds = 1789 postcopy_discard_send_init(ms, first, block->idstr); 1790 1791 /* First pass: Discard all partially sent host pages */ 1792 postcopy_chunk_hostpages_pass(ms, true, block, pds); 1793 /* 1794 * Second pass: Ensure that all partially dirty host pages are made 1795 * fully dirty. 1796 */ 1797 postcopy_chunk_hostpages_pass(ms, false, block, pds); 1798 1799 postcopy_discard_send_finish(ms, pds); 1800 } /* ram_list loop */ 1801 1802 return 0; 1803 } 1804 1805 /* 1806 * Transmit the set of pages to be discarded after precopy to the target 1807 * these are pages that: 1808 * a) Have been previously transmitted but are now dirty again 1809 * b) Pages that have never been transmitted, this ensures that 1810 * any pages on the destination that have been mapped by background 1811 * tasks get discarded (transparent huge pages is the specific concern) 1812 * Hopefully this is pretty sparse 1813 */ 1814 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 1815 { 1816 int ret; 1817 unsigned long *bitmap, *unsentmap; 1818 1819 rcu_read_lock(); 1820 1821 /* This should be our last sync, the src is now paused */ 1822 migration_bitmap_sync(); 1823 1824 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1825 if (!unsentmap) { 1826 /* We don't have a safe way to resize the sentmap, so 1827 * if the bitmap was resized it will be NULL at this 1828 * point. 1829 */ 1830 error_report("migration ram resized during precopy phase"); 1831 rcu_read_unlock(); 1832 return -EINVAL; 1833 } 1834 1835 /* Deal with TPS != HPS */ 1836 ret = postcopy_chunk_hostpages(ms); 1837 if (ret) { 1838 rcu_read_unlock(); 1839 return ret; 1840 } 1841 1842 /* 1843 * Update the unsentmap to be unsentmap = unsentmap | dirty 1844 */ 1845 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1846 bitmap_or(unsentmap, unsentmap, bitmap, 1847 last_ram_offset() >> TARGET_PAGE_BITS); 1848 1849 1850 trace_ram_postcopy_send_discard_bitmap(); 1851 #ifdef DEBUG_POSTCOPY 1852 ram_debug_dump_bitmap(unsentmap, true); 1853 #endif 1854 1855 ret = postcopy_each_ram_send_discard(ms); 1856 rcu_read_unlock(); 1857 1858 return ret; 1859 } 1860 1861 /* 1862 * At the start of the postcopy phase of migration, any now-dirty 1863 * precopied pages are discarded. 1864 * 1865 * start, length describe a byte address range within the RAMBlock 1866 * 1867 * Returns 0 on success. 
1868 */ 1869 int ram_discard_range(MigrationIncomingState *mis, 1870 const char *block_name, 1871 uint64_t start, size_t length) 1872 { 1873 int ret = -1; 1874 1875 rcu_read_lock(); 1876 RAMBlock *rb = qemu_ram_block_by_name(block_name); 1877 1878 if (!rb) { 1879 error_report("ram_discard_range: Failed to find block '%s'", 1880 block_name); 1881 goto err; 1882 } 1883 1884 uint8_t *host_startaddr = rb->host + start; 1885 1886 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) { 1887 error_report("ram_discard_range: Unaligned start address: %p", 1888 host_startaddr); 1889 goto err; 1890 } 1891 1892 if ((start + length) <= rb->used_length) { 1893 uint8_t *host_endaddr = host_startaddr + length; 1894 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) { 1895 error_report("ram_discard_range: Unaligned end address: %p", 1896 host_endaddr); 1897 goto err; 1898 } 1899 ret = postcopy_ram_discard_range(mis, host_startaddr, length); 1900 } else { 1901 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 1902 "/%zx/" RAM_ADDR_FMT")", 1903 block_name, start, length, rb->used_length); 1904 } 1905 1906 err: 1907 rcu_read_unlock(); 1908 1909 return ret; 1910 } 1911 1912 static int ram_save_init_globals(void) 1913 { 1914 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ 1915 1916 dirty_rate_high_cnt = 0; 1917 bitmap_sync_count = 0; 1918 migration_bitmap_sync_init(); 1919 qemu_mutex_init(&migration_bitmap_mutex); 1920 1921 if (migrate_use_xbzrle()) { 1922 XBZRLE_cache_lock(); 1923 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE); 1924 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / 1925 TARGET_PAGE_SIZE, 1926 TARGET_PAGE_SIZE); 1927 if (!XBZRLE.cache) { 1928 XBZRLE_cache_unlock(); 1929 error_report("Error creating cache"); 1930 return -1; 1931 } 1932 XBZRLE_cache_unlock(); 1933 1934 /* We prefer not to abort if there is no memory */ 1935 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 1936 if (!XBZRLE.encoded_buf) { 1937 error_report("Error allocating encoded_buf"); 1938 return -1; 1939 } 1940 1941 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 1942 if (!XBZRLE.current_buf) { 1943 error_report("Error allocating current_buf"); 1944 g_free(XBZRLE.encoded_buf); 1945 XBZRLE.encoded_buf = NULL; 1946 return -1; 1947 } 1948 1949 acct_clear(); 1950 } 1951 1952 /* For memory_global_dirty_log_start below. */ 1953 qemu_mutex_lock_iothread(); 1954 1955 qemu_mutex_lock_ramlist(); 1956 rcu_read_lock(); 1957 bytes_transferred = 0; 1958 reset_ram_globals(); 1959 1960 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); 1961 /* Skip setting bitmap if there is no RAM */ 1962 if (ram_bytes_total()) { 1963 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; 1964 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); 1965 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); 1966 1967 if (migrate_postcopy_ram()) { 1968 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); 1969 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); 1970 } 1971 } 1972 1973 /* 1974 * Count the total number of pages used by ram blocks not including any 1975 * gaps due to alignment or unplugs. 

static int ram_save_init_globals(void)
{
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();
    qemu_mutex_init(&migration_bitmap_mutex);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
        migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
        bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);

        if (migrate_postcopy_ram()) {
            migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
            bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
        }
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}

/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;

    /* migration has already set up the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_save_init_globals() < 0) {
            return -1;
        }
    }

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        acct_info.iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check it every
           few iterations.
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
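
/*
 * Illustrative sketch (not part of the build): the "only look at the clock
 * every 64 iterations" pattern used by ram_save_iterate() above.  The work
 * done per iteration and the 50ms budget are stand-ins.
 *
 *     int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 *     int i = 0;
 *
 *     for (;;) {
 *         // ... send one page ...
 *         if ((i & 63) == 0) {                    // amortise the clock read
 *             uint64_t ms = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0)
 *                           / 1000000;
 *             if (ms > 50) {
 *                 break;                          // yield to the main loop
 *             }
 *         }
 *         i++;
 *     }
 */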

/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    if (!migration_in_postcopy(migrate_get_current())) {
        migration_bitmap_sync();
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
                                        &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy(migrate_get_current()) &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
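
/*
 * Illustrative sketch (not part of the build): the framing consumed by
 * load_xbzrle() above, written as a hypothetical sender would emit it.
 * encoded_buf/encoded_len are placeholders for the XBZRLE encoder output.
 *
 *     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);    // 1-byte header flag
 *     qemu_put_be16(f, encoded_len);             // encoded length, <= page
 *     qemu_put_buffer(f, encoded_buf, encoded_len);
 */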

/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
/*
 * Read a RAMBlock ID from the stream f.
 *
 * f: Stream to read from
 * flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
                                              int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
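
/*
 * Illustrative sketch (not part of the build): the effect of the check in
 * ram_handle_compressed() above, on a toy 16-byte "page".
 *
 *     uint8_t page[16] = { 0 };        // destination page, still untouched
 *     uint8_t ch = 0;                  // fill byte received from the stream
 *
 *     if (ch != 0 || !buffer_is_zero(page, sizeof(page))) {
 *         memset(page, ch, sizeof(page));
 *     }
 *     // A zero fill of an already-zero page skips the memset entirely, so
 *     // the destination page is never written (and never allocated).
 */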

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is not
             * a problem, because the dirty page will be retransmitted and
             * uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
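
/*
 * Illustrative sketch (not part of the build): the dispatch loop used by
 * decompress_data_with_multi_threads() above - hand the page to an idle
 * worker if there is one, otherwise sleep until a worker signals
 * completion.  pick_idle_worker()/hand_page_to_worker() are hypothetical
 * helpers standing in for the scan over decomp_param[].
 *
 *     qemu_mutex_lock(&decomp_done_lock);
 *     for (;;) {
 *         int idx = pick_idle_worker();
 *         if (idx >= 0) {
 *             hand_page_to_worker(idx);
 *             break;
 *         }
 *         // every worker is busy: wait for one to set done = true
 *         qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
 *     }
 *     qemu_mutex_unlock(&decomp_done_lock);
 */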

/*
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named postcopy_ram_incoming_init()
 * does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/*
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            /*
             * Postcopy requires that we place whole host pages atomically.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & ~qemu_host_page_mask);
            /* If all target pages are zero then we can optimise the place */
            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
                all_zero = true;
            } else {
                /* not the 1st target page within the host page */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                 host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                            ~qemu_host_page_mask) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            if (all_zero) {
                ret = postcopy_place_page_zero(mis,
                                               host + TARGET_PAGE_SIZE -
                                               qemu_host_page_size);
            } else {
                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
                                               qemu_host_page_size,
                                          place_source);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
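
/*
 * Illustrative sketch (not part of the build): where a target page lands
 * inside the temporary host page in ram_load_postcopy() above.  The 4K
 * target-page and 16K host-page sizes are example values.
 *
 *     size_t tp_size = 4 * 1024;              // example TARGET_PAGE_SIZE
 *     size_t hp_size = 16 * 1024;             // example host page size
 *     uintptr_t host = 0x7f000000c000;        // hypothetical destination
 *
 *     // offset of this target page within its host page: 0, 4K, 8K or 12K
 *     size_t in_hp_off = host & (hp_size - 1);
 *
 *     // the whole host page is placed only once its last slot has arrived
 *     bool last_slot = ((host + tp_size) & (hp_size - 1)) == 0;
 */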

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic
     */
    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}
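
/*
 * Illustrative sketch (not part of the build): the header word decoded at
 * the top of the ram_load() loop above.  Because page addresses are
 * target-page aligned, the sender can pack the RAM_SAVE_FLAG_* bits into
 * the low bits of one 64-bit value; 0x1000 stands in for the target page
 * size and the address is hypothetical.
 *
 *     uint64_t header = 0x0000000000235000ULL | RAM_SAVE_FLAG_PAGE;
 *
 *     uint64_t flags = header & (0x1000 - 1);              // low bits: flags
 *     uint64_t addr = header & ~(uint64_t)(0x1000 - 1);    // page address
 */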

static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}