1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 #include "qemu/osdep.h" 29 #include "qemu-common.h" 30 #include "cpu.h" 31 #include <zlib.h> 32 #include "qapi-event.h" 33 #include "qemu/cutils.h" 34 #include "qemu/bitops.h" 35 #include "qemu/bitmap.h" 36 #include "qemu/timer.h" 37 #include "qemu/main-loop.h" 38 #include "migration/migration.h" 39 #include "migration/postcopy-ram.h" 40 #include "exec/address-spaces.h" 41 #include "migration/page_cache.h" 42 #include "qemu/error-report.h" 43 #include "trace.h" 44 #include "exec/ram_addr.h" 45 #include "qemu/rcu_queue.h" 46 #include "migration/colo.h" 47 48 static int dirty_rate_high_cnt; 49 50 static uint64_t bitmap_sync_count; 51 52 /***********************************************************/ 53 /* ram save/restore */ 54 55 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 56 #define RAM_SAVE_FLAG_COMPRESS 0x02 57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 58 #define RAM_SAVE_FLAG_PAGE 0x08 59 #define RAM_SAVE_FLAG_EOS 0x10 60 #define RAM_SAVE_FLAG_CONTINUE 0x20 61 #define RAM_SAVE_FLAG_XBZRLE 0x40 62 /* 0x80 is reserved in migration.h start with 0x100 next */ 63 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 64 65 static uint8_t *ZERO_TARGET_PAGE; 66 67 static inline bool is_zero_range(uint8_t *p, uint64_t size) 68 { 69 return buffer_is_zero(p, size); 70 } 71 72 /* struct contains XBZRLE cache and a static page 73 used by the compression */ 74 static struct { 75 /* buffer used for XBZRLE encoding */ 76 uint8_t *encoded_buf; 77 /* buffer for storing page content */ 78 uint8_t *current_buf; 79 /* Cache for XBZRLE, Protected by lock. */ 80 PageCache *cache; 81 QemuMutex lock; 82 } XBZRLE; 83 84 /* buffer used for XBZRLE decoding */ 85 static uint8_t *xbzrle_decoded_buf; 86 87 static void XBZRLE_cache_lock(void) 88 { 89 if (migrate_use_xbzrle()) 90 qemu_mutex_lock(&XBZRLE.lock); 91 } 92 93 static void XBZRLE_cache_unlock(void) 94 { 95 if (migrate_use_xbzrle()) 96 qemu_mutex_unlock(&XBZRLE.lock); 97 } 98 99 /* 100 * called from qmp_migrate_set_cache_size in main thread, possibly while 101 * a migration is in progress. 102 * A running migration maybe using the cache and might finish during this 103 * call, hence changes to the cache are protected by XBZRLE.lock(). 
104 */ 105 int64_t xbzrle_cache_resize(int64_t new_size) 106 { 107 PageCache *new_cache; 108 int64_t ret; 109 110 if (new_size < TARGET_PAGE_SIZE) { 111 return -1; 112 } 113 114 XBZRLE_cache_lock(); 115 116 if (XBZRLE.cache != NULL) { 117 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) { 118 goto out_new_size; 119 } 120 new_cache = cache_init(new_size / TARGET_PAGE_SIZE, 121 TARGET_PAGE_SIZE); 122 if (!new_cache) { 123 error_report("Error creating cache"); 124 ret = -1; 125 goto out; 126 } 127 128 cache_fini(XBZRLE.cache); 129 XBZRLE.cache = new_cache; 130 } 131 132 out_new_size: 133 ret = pow2floor(new_size); 134 out: 135 XBZRLE_cache_unlock(); 136 return ret; 137 } 138 139 /* accounting for migration statistics */ 140 typedef struct AccountingInfo { 141 uint64_t dup_pages; 142 uint64_t skipped_pages; 143 uint64_t norm_pages; 144 uint64_t iterations; 145 uint64_t xbzrle_bytes; 146 uint64_t xbzrle_pages; 147 uint64_t xbzrle_cache_miss; 148 double xbzrle_cache_miss_rate; 149 uint64_t xbzrle_overflows; 150 } AccountingInfo; 151 152 static AccountingInfo acct_info; 153 154 static void acct_clear(void) 155 { 156 memset(&acct_info, 0, sizeof(acct_info)); 157 } 158 159 uint64_t dup_mig_bytes_transferred(void) 160 { 161 return acct_info.dup_pages * TARGET_PAGE_SIZE; 162 } 163 164 uint64_t dup_mig_pages_transferred(void) 165 { 166 return acct_info.dup_pages; 167 } 168 169 uint64_t skipped_mig_bytes_transferred(void) 170 { 171 return acct_info.skipped_pages * TARGET_PAGE_SIZE; 172 } 173 174 uint64_t skipped_mig_pages_transferred(void) 175 { 176 return acct_info.skipped_pages; 177 } 178 179 uint64_t norm_mig_bytes_transferred(void) 180 { 181 return acct_info.norm_pages * TARGET_PAGE_SIZE; 182 } 183 184 uint64_t norm_mig_pages_transferred(void) 185 { 186 return acct_info.norm_pages; 187 } 188 189 uint64_t xbzrle_mig_bytes_transferred(void) 190 { 191 return acct_info.xbzrle_bytes; 192 } 193 194 uint64_t xbzrle_mig_pages_transferred(void) 195 { 196 return acct_info.xbzrle_pages; 197 } 198 199 uint64_t xbzrle_mig_pages_cache_miss(void) 200 { 201 return acct_info.xbzrle_cache_miss; 202 } 203 204 double xbzrle_mig_cache_miss_rate(void) 205 { 206 return acct_info.xbzrle_cache_miss_rate; 207 } 208 209 uint64_t xbzrle_mig_pages_overflow(void) 210 { 211 return acct_info.xbzrle_overflows; 212 } 213 214 /* This is the last block that we have visited serching for dirty pages 215 */ 216 static RAMBlock *last_seen_block; 217 /* This is the last block from where we have sent data */ 218 static RAMBlock *last_sent_block; 219 static ram_addr_t last_offset; 220 static QemuMutex migration_bitmap_mutex; 221 static uint64_t migration_dirty_pages; 222 static uint32_t last_version; 223 static bool ram_bulk_stage; 224 225 /* used by the search for pages to send */ 226 struct PageSearchStatus { 227 /* Current block being searched */ 228 RAMBlock *block; 229 /* Current offset to search from */ 230 ram_addr_t offset; 231 /* Set once we wrap around */ 232 bool complete_round; 233 }; 234 typedef struct PageSearchStatus PageSearchStatus; 235 236 static struct BitmapRcu { 237 struct rcu_head rcu; 238 /* Main migration bitmap */ 239 unsigned long *bmap; 240 /* bitmap of pages that haven't been sent even once 241 * only maintained and used in postcopy at the moment 242 * where it's used to send the dirtymap at the start 243 * of the postcopy phase 244 */ 245 unsigned long *unsentmap; 246 } *migration_bitmap_rcu; 247 248 struct CompressParam { 249 bool done; 250 bool quit; 251 QEMUFile *file; 252 QemuMutex mutex; 253 QemuCond 
cond; 254 RAMBlock *block; 255 ram_addr_t offset; 256 }; 257 typedef struct CompressParam CompressParam; 258 259 struct DecompressParam { 260 bool done; 261 bool quit; 262 QemuMutex mutex; 263 QemuCond cond; 264 void *des; 265 uint8_t *compbuf; 266 int len; 267 }; 268 typedef struct DecompressParam DecompressParam; 269 270 static CompressParam *comp_param; 271 static QemuThread *compress_threads; 272 /* comp_done_cond is used to wake up the migration thread when 273 * one of the compression threads has finished the compression. 274 * comp_done_lock is used to co-work with comp_done_cond. 275 */ 276 static QemuMutex comp_done_lock; 277 static QemuCond comp_done_cond; 278 /* The empty QEMUFileOps will be used by file in CompressParam */ 279 static const QEMUFileOps empty_ops = { }; 280 281 static bool compression_switch; 282 static DecompressParam *decomp_param; 283 static QemuThread *decompress_threads; 284 static QemuMutex decomp_done_lock; 285 static QemuCond decomp_done_cond; 286 287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, 288 ram_addr_t offset); 289 290 static void *do_data_compress(void *opaque) 291 { 292 CompressParam *param = opaque; 293 RAMBlock *block; 294 ram_addr_t offset; 295 296 qemu_mutex_lock(¶m->mutex); 297 while (!param->quit) { 298 if (param->block) { 299 block = param->block; 300 offset = param->offset; 301 param->block = NULL; 302 qemu_mutex_unlock(¶m->mutex); 303 304 do_compress_ram_page(param->file, block, offset); 305 306 qemu_mutex_lock(&comp_done_lock); 307 param->done = true; 308 qemu_cond_signal(&comp_done_cond); 309 qemu_mutex_unlock(&comp_done_lock); 310 311 qemu_mutex_lock(¶m->mutex); 312 } else { 313 qemu_cond_wait(¶m->cond, ¶m->mutex); 314 } 315 } 316 qemu_mutex_unlock(¶m->mutex); 317 318 return NULL; 319 } 320 321 static inline void terminate_compression_threads(void) 322 { 323 int idx, thread_count; 324 325 thread_count = migrate_compress_threads(); 326 for (idx = 0; idx < thread_count; idx++) { 327 qemu_mutex_lock(&comp_param[idx].mutex); 328 comp_param[idx].quit = true; 329 qemu_cond_signal(&comp_param[idx].cond); 330 qemu_mutex_unlock(&comp_param[idx].mutex); 331 } 332 } 333 334 void migrate_compress_threads_join(void) 335 { 336 int i, thread_count; 337 338 if (!migrate_use_compression()) { 339 return; 340 } 341 terminate_compression_threads(); 342 thread_count = migrate_compress_threads(); 343 for (i = 0; i < thread_count; i++) { 344 qemu_thread_join(compress_threads + i); 345 qemu_fclose(comp_param[i].file); 346 qemu_mutex_destroy(&comp_param[i].mutex); 347 qemu_cond_destroy(&comp_param[i].cond); 348 } 349 qemu_mutex_destroy(&comp_done_lock); 350 qemu_cond_destroy(&comp_done_cond); 351 g_free(compress_threads); 352 g_free(comp_param); 353 compress_threads = NULL; 354 comp_param = NULL; 355 } 356 357 void migrate_compress_threads_create(void) 358 { 359 int i, thread_count; 360 361 if (!migrate_use_compression()) { 362 return; 363 } 364 compression_switch = true; 365 thread_count = migrate_compress_threads(); 366 compress_threads = g_new0(QemuThread, thread_count); 367 comp_param = g_new0(CompressParam, thread_count); 368 qemu_cond_init(&comp_done_cond); 369 qemu_mutex_init(&comp_done_lock); 370 for (i = 0; i < thread_count; i++) { 371 /* comp_param[i].file is just used as a dummy buffer to save data, 372 * set its ops to empty. 
373 */ 374 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 375 comp_param[i].done = true; 376 comp_param[i].quit = false; 377 qemu_mutex_init(&comp_param[i].mutex); 378 qemu_cond_init(&comp_param[i].cond); 379 qemu_thread_create(compress_threads + i, "compress", 380 do_data_compress, comp_param + i, 381 QEMU_THREAD_JOINABLE); 382 } 383 } 384 385 /** 386 * save_page_header: Write page header to wire 387 * 388 * If this is the 1st block, it also writes the block identification 389 * 390 * Returns: Number of bytes written 391 * 392 * @f: QEMUFile where to send the data 393 * @block: block that contains the page we want to send 394 * @offset: offset inside the block for the page 395 * in the lower bits, it contains flags 396 */ 397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset) 398 { 399 size_t size, len; 400 401 qemu_put_be64(f, offset); 402 size = 8; 403 404 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { 405 len = strlen(block->idstr); 406 qemu_put_byte(f, len); 407 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 408 size += 1 + len; 409 } 410 return size; 411 } 412 413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes. 414 * If guest dirty memory rate is reduced below the rate at which we can 415 * transfer pages to the destination then we should be able to complete 416 * migration. Some workloads dirty memory way too fast and will not effectively 417 * converge, even with auto-converge. 418 */ 419 static void mig_throttle_guest_down(void) 420 { 421 MigrationState *s = migrate_get_current(); 422 uint64_t pct_initial = s->parameters.cpu_throttle_initial; 423 uint64_t pct_icrement = s->parameters.cpu_throttle_increment; 424 425 /* We have not started throttling yet. Let's start it. */ 426 if (!cpu_throttle_active()) { 427 cpu_throttle_set(pct_initial); 428 } else { 429 /* Throttling already on, just increase the rate */ 430 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement); 431 } 432 } 433 434 /* Update the xbzrle cache to reflect a page that's been sent as all 0. 435 * The important thing is that a stale (not-yet-0'd) page be replaced 436 * by the new data. 
437 * As a bonus, if the page wasn't in the cache it gets added so that 438 * when a small write is made into the 0'd page it gets XBZRLE sent 439 */ 440 static void xbzrle_cache_zero_page(ram_addr_t current_addr) 441 { 442 if (ram_bulk_stage || !migrate_use_xbzrle()) { 443 return; 444 } 445 446 /* We don't care if this fails to allocate a new cache page 447 * as long as it updated an old one */ 448 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, 449 bitmap_sync_count); 450 } 451 452 #define ENCODING_FLAG_XBZRLE 0x1 453 454 /** 455 * save_xbzrle_page: compress and send current page 456 * 457 * Returns: 1 means that we wrote the page 458 * 0 means that page is identical to the one already sent 459 * -1 means that xbzrle would be longer than normal 460 * 461 * @f: QEMUFile where to send the data 462 * @current_data: 463 * @current_addr: 464 * @block: block that contains the page we want to send 465 * @offset: offset inside the block for the page 466 * @last_stage: if we are at the completion stage 467 * @bytes_transferred: increase it with the number of transferred bytes 468 */ 469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, 470 ram_addr_t current_addr, RAMBlock *block, 471 ram_addr_t offset, bool last_stage, 472 uint64_t *bytes_transferred) 473 { 474 int encoded_len = 0, bytes_xbzrle; 475 uint8_t *prev_cached_page; 476 477 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) { 478 acct_info.xbzrle_cache_miss++; 479 if (!last_stage) { 480 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 481 bitmap_sync_count) == -1) { 482 return -1; 483 } else { 484 /* update *current_data when the page has been 485 inserted into cache */ 486 *current_data = get_cached_data(XBZRLE.cache, current_addr); 487 } 488 } 489 return -1; 490 } 491 492 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 493 494 /* save current buffer into memory */ 495 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 496 497 /* XBZRLE encoding (if there is no overflow) */ 498 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 499 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 500 TARGET_PAGE_SIZE); 501 if (encoded_len == 0) { 502 trace_save_xbzrle_page_skipping(); 503 return 0; 504 } else if (encoded_len == -1) { 505 trace_save_xbzrle_page_overflow(); 506 acct_info.xbzrle_overflows++; 507 /* update data in the cache */ 508 if (!last_stage) { 509 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); 510 *current_data = prev_cached_page; 511 } 512 return -1; 513 } 514 515 /* we need to update the data in the cache, in order to get the same data */ 516 if (!last_stage) { 517 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 518 } 519 520 /* Send XBZRLE based compressed page */ 521 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE); 522 qemu_put_byte(f, ENCODING_FLAG_XBZRLE); 523 qemu_put_be16(f, encoded_len); 524 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); 525 bytes_xbzrle += encoded_len + 1 + 2; 526 acct_info.xbzrle_pages++; 527 acct_info.xbzrle_bytes += bytes_xbzrle; 528 *bytes_transferred += bytes_xbzrle; 529 530 return 1; 531 } 532 533 /* Called with rcu_read_lock() to protect migration_bitmap 534 * rb: The RAMBlock to search for dirty pages in 535 * start: Start address (typically so we can continue from previous page) 536 * ram_addr_abs: Pointer into which to store the address of the dirty page 537 * within the global ram_addr space 538 * 539 * Returns: byte offset within memory region of 
the start of a dirty page 540 */ 541 static inline 542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, 543 ram_addr_t start, 544 ram_addr_t *ram_addr_abs) 545 { 546 unsigned long base = rb->offset >> TARGET_PAGE_BITS; 547 unsigned long nr = base + (start >> TARGET_PAGE_BITS); 548 uint64_t rb_size = rb->used_length; 549 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS); 550 unsigned long *bitmap; 551 552 unsigned long next; 553 554 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 555 if (ram_bulk_stage && nr > base) { 556 next = nr + 1; 557 } else { 558 next = find_next_bit(bitmap, size, nr); 559 } 560 561 *ram_addr_abs = next << TARGET_PAGE_BITS; 562 return (next - base) << TARGET_PAGE_BITS; 563 } 564 565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr) 566 { 567 bool ret; 568 int nr = addr >> TARGET_PAGE_BITS; 569 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 570 571 ret = test_and_clear_bit(nr, bitmap); 572 573 if (ret) { 574 migration_dirty_pages--; 575 } 576 return ret; 577 } 578 579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) 580 { 581 unsigned long *bitmap; 582 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 583 migration_dirty_pages += 584 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); 585 } 586 587 /* Fix me: there are too many global variables used in migration process. */ 588 static int64_t start_time; 589 static int64_t bytes_xfer_prev; 590 static int64_t num_dirty_pages_period; 591 static uint64_t xbzrle_cache_miss_prev; 592 static uint64_t iterations_prev; 593 594 static void migration_bitmap_sync_init(void) 595 { 596 start_time = 0; 597 bytes_xfer_prev = 0; 598 num_dirty_pages_period = 0; 599 xbzrle_cache_miss_prev = 0; 600 iterations_prev = 0; 601 } 602 603 static void migration_bitmap_sync(void) 604 { 605 RAMBlock *block; 606 uint64_t num_dirty_pages_init = migration_dirty_pages; 607 MigrationState *s = migrate_get_current(); 608 int64_t end_time; 609 int64_t bytes_xfer_now; 610 611 bitmap_sync_count++; 612 613 if (!bytes_xfer_prev) { 614 bytes_xfer_prev = ram_bytes_transferred(); 615 } 616 617 if (!start_time) { 618 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 619 } 620 621 trace_migration_bitmap_sync_start(); 622 memory_global_dirty_log_sync(); 623 624 qemu_mutex_lock(&migration_bitmap_mutex); 625 rcu_read_lock(); 626 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 627 migration_bitmap_sync_range(block->offset, block->used_length); 628 } 629 rcu_read_unlock(); 630 qemu_mutex_unlock(&migration_bitmap_mutex); 631 632 trace_migration_bitmap_sync_end(migration_dirty_pages 633 - num_dirty_pages_init); 634 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; 635 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 636 637 /* more than 1 second = 1000 millisecons */ 638 if (end_time > start_time + 1000) { 639 if (migrate_auto_converge()) { 640 /* The following detection logic can be refined later. For now: 641 Check to see if the dirtied bytes is 50% more than the approx. 642 amount of bytes that just got transferred since the last time we 643 were in this routine. 
If that happens twice, start or increase 644 throttling */ 645 bytes_xfer_now = ram_bytes_transferred(); 646 647 if (s->dirty_pages_rate && 648 (num_dirty_pages_period * TARGET_PAGE_SIZE > 649 (bytes_xfer_now - bytes_xfer_prev)/2) && 650 (dirty_rate_high_cnt++ >= 2)) { 651 trace_migration_throttle(); 652 dirty_rate_high_cnt = 0; 653 mig_throttle_guest_down(); 654 } 655 bytes_xfer_prev = bytes_xfer_now; 656 } 657 658 if (migrate_use_xbzrle()) { 659 if (iterations_prev != acct_info.iterations) { 660 acct_info.xbzrle_cache_miss_rate = 661 (double)(acct_info.xbzrle_cache_miss - 662 xbzrle_cache_miss_prev) / 663 (acct_info.iterations - iterations_prev); 664 } 665 iterations_prev = acct_info.iterations; 666 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss; 667 } 668 s->dirty_pages_rate = num_dirty_pages_period * 1000 669 / (end_time - start_time); 670 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE; 671 start_time = end_time; 672 num_dirty_pages_period = 0; 673 } 674 s->dirty_sync_count = bitmap_sync_count; 675 if (migrate_use_events()) { 676 qapi_event_send_migration_pass(bitmap_sync_count, NULL); 677 } 678 } 679 680 /** 681 * save_zero_page: Send the zero page to the stream 682 * 683 * Returns: Number of pages written. 684 * 685 * @f: QEMUFile where to send the data 686 * @block: block that contains the page we want to send 687 * @offset: offset inside the block for the page 688 * @p: pointer to the page 689 * @bytes_transferred: increase it with the number of transferred bytes 690 */ 691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, 692 uint8_t *p, uint64_t *bytes_transferred) 693 { 694 int pages = -1; 695 696 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 697 acct_info.dup_pages++; 698 *bytes_transferred += save_page_header(f, block, 699 offset | RAM_SAVE_FLAG_COMPRESS); 700 qemu_put_byte(f, 0); 701 *bytes_transferred += 1; 702 pages = 1; 703 } 704 705 return pages; 706 } 707 708 /** 709 * ram_save_page: Send the given page to the stream 710 * 711 * Returns: Number of pages written. 712 * < 0 - error 713 * >=0 - Number of pages written - this might legally be 0 714 * if xbzrle noticed the page was the same. 
715 * 716 * @f: QEMUFile where to send the data 717 * @block: block that contains the page we want to send 718 * @offset: offset inside the block for the page 719 * @last_stage: if we are at the completion stage 720 * @bytes_transferred: increase it with the number of transferred bytes 721 */ 722 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, 723 bool last_stage, uint64_t *bytes_transferred) 724 { 725 int pages = -1; 726 uint64_t bytes_xmit; 727 ram_addr_t current_addr; 728 uint8_t *p; 729 int ret; 730 bool send_async = true; 731 RAMBlock *block = pss->block; 732 ram_addr_t offset = pss->offset; 733 734 p = block->host + offset; 735 736 /* In doubt sent page as normal */ 737 bytes_xmit = 0; 738 ret = ram_control_save_page(f, block->offset, 739 offset, TARGET_PAGE_SIZE, &bytes_xmit); 740 if (bytes_xmit) { 741 *bytes_transferred += bytes_xmit; 742 pages = 1; 743 } 744 745 XBZRLE_cache_lock(); 746 747 current_addr = block->offset + offset; 748 749 if (block == last_sent_block) { 750 offset |= RAM_SAVE_FLAG_CONTINUE; 751 } 752 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 753 if (ret != RAM_SAVE_CONTROL_DELAYED) { 754 if (bytes_xmit > 0) { 755 acct_info.norm_pages++; 756 } else if (bytes_xmit == 0) { 757 acct_info.dup_pages++; 758 } 759 } 760 } else { 761 pages = save_zero_page(f, block, offset, p, bytes_transferred); 762 if (pages > 0) { 763 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 764 * page would be stale 765 */ 766 xbzrle_cache_zero_page(current_addr); 767 } else if (!ram_bulk_stage && 768 !migration_in_postcopy(migrate_get_current()) && 769 migrate_use_xbzrle()) { 770 pages = save_xbzrle_page(f, &p, current_addr, block, 771 offset, last_stage, bytes_transferred); 772 if (!last_stage) { 773 /* Can't send this cached data async, since the cache page 774 * might get updated before it gets to the wire 775 */ 776 send_async = false; 777 } 778 } 779 } 780 781 /* XBZRLE overflow or normal page */ 782 if (pages == -1) { 783 *bytes_transferred += save_page_header(f, block, 784 offset | RAM_SAVE_FLAG_PAGE); 785 if (send_async) { 786 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); 787 } else { 788 qemu_put_buffer(f, p, TARGET_PAGE_SIZE); 789 } 790 *bytes_transferred += TARGET_PAGE_SIZE; 791 pages = 1; 792 acct_info.norm_pages++; 793 } 794 795 XBZRLE_cache_unlock(); 796 797 return pages; 798 } 799 800 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, 801 ram_addr_t offset) 802 { 803 int bytes_sent, blen; 804 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 805 806 bytes_sent = save_page_header(f, block, offset | 807 RAM_SAVE_FLAG_COMPRESS_PAGE); 808 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, 809 migrate_compress_level()); 810 if (blen < 0) { 811 bytes_sent = 0; 812 qemu_file_set_error(migrate_get_current()->to_dst_file, blen); 813 error_report("compressed data failed!"); 814 } else { 815 bytes_sent += blen; 816 } 817 818 return bytes_sent; 819 } 820 821 static uint64_t bytes_transferred; 822 823 static void flush_compressed_data(QEMUFile *f) 824 { 825 int idx, len, thread_count; 826 827 if (!migrate_use_compression()) { 828 return; 829 } 830 thread_count = migrate_compress_threads(); 831 832 qemu_mutex_lock(&comp_done_lock); 833 for (idx = 0; idx < thread_count; idx++) { 834 while (!comp_param[idx].done) { 835 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 836 } 837 } 838 qemu_mutex_unlock(&comp_done_lock); 839 840 for (idx = 0; idx < thread_count; idx++) { 841 qemu_mutex_lock(&comp_param[idx].mutex); 842 if (!comp_param[idx].quit) { 
843 len = qemu_put_qemu_file(f, comp_param[idx].file); 844 bytes_transferred += len; 845 } 846 qemu_mutex_unlock(&comp_param[idx].mutex); 847 } 848 } 849 850 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 851 ram_addr_t offset) 852 { 853 param->block = block; 854 param->offset = offset; 855 } 856 857 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, 858 ram_addr_t offset, 859 uint64_t *bytes_transferred) 860 { 861 int idx, thread_count, bytes_xmit = -1, pages = -1; 862 863 thread_count = migrate_compress_threads(); 864 qemu_mutex_lock(&comp_done_lock); 865 while (true) { 866 for (idx = 0; idx < thread_count; idx++) { 867 if (comp_param[idx].done) { 868 comp_param[idx].done = false; 869 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file); 870 qemu_mutex_lock(&comp_param[idx].mutex); 871 set_compress_params(&comp_param[idx], block, offset); 872 qemu_cond_signal(&comp_param[idx].cond); 873 qemu_mutex_unlock(&comp_param[idx].mutex); 874 pages = 1; 875 acct_info.norm_pages++; 876 *bytes_transferred += bytes_xmit; 877 break; 878 } 879 } 880 if (pages > 0) { 881 break; 882 } else { 883 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 884 } 885 } 886 qemu_mutex_unlock(&comp_done_lock); 887 888 return pages; 889 } 890 891 /** 892 * ram_save_compressed_page: compress the given page and send it to the stream 893 * 894 * Returns: Number of pages written. 895 * 896 * @f: QEMUFile where to send the data 897 * @block: block that contains the page we want to send 898 * @offset: offset inside the block for the page 899 * @last_stage: if we are at the completion stage 900 * @bytes_transferred: increase it with the number of transferred bytes 901 */ 902 static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, 903 bool last_stage, 904 uint64_t *bytes_transferred) 905 { 906 int pages = -1; 907 uint64_t bytes_xmit = 0; 908 uint8_t *p; 909 int ret, blen; 910 RAMBlock *block = pss->block; 911 ram_addr_t offset = pss->offset; 912 913 p = block->host + offset; 914 915 ret = ram_control_save_page(f, block->offset, 916 offset, TARGET_PAGE_SIZE, &bytes_xmit); 917 if (bytes_xmit) { 918 *bytes_transferred += bytes_xmit; 919 pages = 1; 920 } 921 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 922 if (ret != RAM_SAVE_CONTROL_DELAYED) { 923 if (bytes_xmit > 0) { 924 acct_info.norm_pages++; 925 } else if (bytes_xmit == 0) { 926 acct_info.dup_pages++; 927 } 928 } 929 } else { 930 /* When starting the process of a new block, the first page of 931 * the block should be sent out before other pages in the same 932 * block, and all the pages in last block should have been sent 933 * out, keeping this order is important, because the 'cont' flag 934 * is used to avoid resending the block name. 
935 */ 936 if (block != last_sent_block) { 937 flush_compressed_data(f); 938 pages = save_zero_page(f, block, offset, p, bytes_transferred); 939 if (pages == -1) { 940 /* Make sure the first page is sent out before other pages */ 941 bytes_xmit = save_page_header(f, block, offset | 942 RAM_SAVE_FLAG_COMPRESS_PAGE); 943 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, 944 migrate_compress_level()); 945 if (blen > 0) { 946 *bytes_transferred += bytes_xmit + blen; 947 acct_info.norm_pages++; 948 pages = 1; 949 } else { 950 qemu_file_set_error(f, blen); 951 error_report("compressed data failed!"); 952 } 953 } 954 } else { 955 offset |= RAM_SAVE_FLAG_CONTINUE; 956 pages = save_zero_page(f, block, offset, p, bytes_transferred); 957 if (pages == -1) { 958 pages = compress_page_with_multi_thread(f, block, offset, 959 bytes_transferred); 960 } 961 } 962 } 963 964 return pages; 965 } 966 967 /* 968 * Find the next dirty page and update any state associated with 969 * the search process. 970 * 971 * Returns: True if a page is found 972 * 973 * @f: Current migration stream. 974 * @pss: Data about the state of the current dirty page scan. 975 * @*again: Set to false if the search has scanned the whole of RAM 976 * *ram_addr_abs: Pointer into which to store the address of the dirty page 977 * within the global ram_addr space 978 */ 979 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, 980 bool *again, ram_addr_t *ram_addr_abs) 981 { 982 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, 983 ram_addr_abs); 984 if (pss->complete_round && pss->block == last_seen_block && 985 pss->offset >= last_offset) { 986 /* 987 * We've been once around the RAM and haven't found anything. 988 * Give up. 989 */ 990 *again = false; 991 return false; 992 } 993 if (pss->offset >= pss->block->used_length) { 994 /* Didn't find anything in this RAM Block */ 995 pss->offset = 0; 996 pss->block = QLIST_NEXT_RCU(pss->block, next); 997 if (!pss->block) { 998 /* Hit the end of the list */ 999 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1000 /* Flag that we've looped */ 1001 pss->complete_round = true; 1002 ram_bulk_stage = false; 1003 if (migrate_use_xbzrle()) { 1004 /* If xbzrle is on, stop using the data compression at this 1005 * point. In theory, xbzrle can do better than compression. 1006 */ 1007 flush_compressed_data(f); 1008 compression_switch = false; 1009 } 1010 } 1011 /* Didn't find anything this time, but try again on the new block */ 1012 *again = true; 1013 return false; 1014 } else { 1015 /* Can go around again, but... 
*/ 1016 *again = true; 1017 /* We've found something so probably don't need to */ 1018 return true; 1019 } 1020 } 1021 1022 /* 1023 * Helper for 'get_queued_page' - gets a page off the queue 1024 * ms: MigrationState in 1025 * *offset: Used to return the offset within the RAMBlock 1026 * ram_addr_abs: global offset in the dirty/sent bitmaps 1027 * 1028 * Returns: block (or NULL if none available) 1029 */ 1030 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, 1031 ram_addr_t *ram_addr_abs) 1032 { 1033 RAMBlock *block = NULL; 1034 1035 qemu_mutex_lock(&ms->src_page_req_mutex); 1036 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { 1037 struct MigrationSrcPageRequest *entry = 1038 QSIMPLEQ_FIRST(&ms->src_page_requests); 1039 block = entry->rb; 1040 *offset = entry->offset; 1041 *ram_addr_abs = (entry->offset + entry->rb->offset) & 1042 TARGET_PAGE_MASK; 1043 1044 if (entry->len > TARGET_PAGE_SIZE) { 1045 entry->len -= TARGET_PAGE_SIZE; 1046 entry->offset += TARGET_PAGE_SIZE; 1047 } else { 1048 memory_region_unref(block->mr); 1049 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); 1050 g_free(entry); 1051 } 1052 } 1053 qemu_mutex_unlock(&ms->src_page_req_mutex); 1054 1055 return block; 1056 } 1057 1058 /* 1059 * Unqueue a page from the queue fed by postcopy page requests; skips pages 1060 * that are already sent (!dirty) 1061 * 1062 * ms: MigrationState in 1063 * pss: PageSearchStatus structure updated with found block/offset 1064 * ram_addr_abs: global offset in the dirty/sent bitmaps 1065 * 1066 * Returns: true if a queued page is found 1067 */ 1068 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, 1069 ram_addr_t *ram_addr_abs) 1070 { 1071 RAMBlock *block; 1072 ram_addr_t offset; 1073 bool dirty; 1074 1075 do { 1076 block = unqueue_page(ms, &offset, ram_addr_abs); 1077 /* 1078 * We're sending this page, and since it's postcopy nothing else 1079 * will dirty it, and we must make sure it doesn't get sent again 1080 * even if this queue request was received after the background 1081 * search already sent it. 1082 */ 1083 if (block) { 1084 unsigned long *bitmap; 1085 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1086 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); 1087 if (!dirty) { 1088 trace_get_queued_page_not_dirty( 1089 block->idstr, (uint64_t)offset, 1090 (uint64_t)*ram_addr_abs, 1091 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, 1092 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); 1093 } else { 1094 trace_get_queued_page(block->idstr, 1095 (uint64_t)offset, 1096 (uint64_t)*ram_addr_abs); 1097 } 1098 } 1099 1100 } while (block && !dirty); 1101 1102 if (block) { 1103 /* 1104 * As soon as we start servicing pages out of order, then we have 1105 * to kill the bulk stage, since the bulk stage assumes 1106 * in (migration_bitmap_find_and_reset_dirty) that every page is 1107 * dirty, that's no longer true. 1108 */ 1109 ram_bulk_stage = false; 1110 1111 /* 1112 * We want the background search to continue from the queued page 1113 * since the guest is likely to want other pages near to the page 1114 * it just requested. 1115 */ 1116 pss->block = block; 1117 pss->offset = offset; 1118 } 1119 1120 return !!block; 1121 } 1122 1123 /** 1124 * flush_page_queue: Flush any remaining pages in the ram request queue 1125 * it should be empty at the end anyway, but in error cases there may be 1126 * some left. 
1127 * 1128 * ms: MigrationState 1129 */ 1130 void flush_page_queue(MigrationState *ms) 1131 { 1132 struct MigrationSrcPageRequest *mspr, *next_mspr; 1133 /* This queue generally should be empty - but in the case of a failed 1134 * migration might have some droppings in. 1135 */ 1136 rcu_read_lock(); 1137 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) { 1138 memory_region_unref(mspr->rb->mr); 1139 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); 1140 g_free(mspr); 1141 } 1142 rcu_read_unlock(); 1143 } 1144 1145 /** 1146 * Queue the pages for transmission, e.g. a request from postcopy destination 1147 * ms: MigrationStatus in which the queue is held 1148 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last) 1149 * start: Offset from the start of the RAMBlock 1150 * len: Length (in bytes) to send 1151 * Return: 0 on success 1152 */ 1153 int ram_save_queue_pages(MigrationState *ms, const char *rbname, 1154 ram_addr_t start, ram_addr_t len) 1155 { 1156 RAMBlock *ramblock; 1157 1158 ms->postcopy_requests++; 1159 rcu_read_lock(); 1160 if (!rbname) { 1161 /* Reuse last RAMBlock */ 1162 ramblock = ms->last_req_rb; 1163 1164 if (!ramblock) { 1165 /* 1166 * Shouldn't happen, we can't reuse the last RAMBlock if 1167 * it's the 1st request. 1168 */ 1169 error_report("ram_save_queue_pages no previous block"); 1170 goto err; 1171 } 1172 } else { 1173 ramblock = qemu_ram_block_by_name(rbname); 1174 1175 if (!ramblock) { 1176 /* We shouldn't be asked for a non-existent RAMBlock */ 1177 error_report("ram_save_queue_pages no block '%s'", rbname); 1178 goto err; 1179 } 1180 ms->last_req_rb = ramblock; 1181 } 1182 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1183 if (start+len > ramblock->used_length) { 1184 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1185 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1186 __func__, start, len, ramblock->used_length); 1187 goto err; 1188 } 1189 1190 struct MigrationSrcPageRequest *new_entry = 1191 g_malloc0(sizeof(struct MigrationSrcPageRequest)); 1192 new_entry->rb = ramblock; 1193 new_entry->offset = start; 1194 new_entry->len = len; 1195 1196 memory_region_ref(ramblock->mr); 1197 qemu_mutex_lock(&ms->src_page_req_mutex); 1198 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req); 1199 qemu_mutex_unlock(&ms->src_page_req_mutex); 1200 rcu_read_unlock(); 1201 1202 return 0; 1203 1204 err: 1205 rcu_read_unlock(); 1206 return -1; 1207 } 1208 1209 /** 1210 * ram_save_target_page: Save one target page 1211 * 1212 * 1213 * @f: QEMUFile where to send the data 1214 * @block: pointer to block that contains the page we want to send 1215 * @offset: offset inside the block for the page; 1216 * @last_stage: if we are at the completion stage 1217 * @bytes_transferred: increase it with the number of transferred bytes 1218 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space 1219 * 1220 * Returns: Number of pages written. 
1221 */ 1222 static int ram_save_target_page(MigrationState *ms, QEMUFile *f, 1223 PageSearchStatus *pss, 1224 bool last_stage, 1225 uint64_t *bytes_transferred, 1226 ram_addr_t dirty_ram_abs) 1227 { 1228 int res = 0; 1229 1230 /* Check the pages is dirty and if it is send it */ 1231 if (migration_bitmap_clear_dirty(dirty_ram_abs)) { 1232 unsigned long *unsentmap; 1233 if (compression_switch && migrate_use_compression()) { 1234 res = ram_save_compressed_page(f, pss, 1235 last_stage, 1236 bytes_transferred); 1237 } else { 1238 res = ram_save_page(f, pss, last_stage, 1239 bytes_transferred); 1240 } 1241 1242 if (res < 0) { 1243 return res; 1244 } 1245 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1246 if (unsentmap) { 1247 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap); 1248 } 1249 /* Only update last_sent_block if a block was actually sent; xbzrle 1250 * might have decided the page was identical so didn't bother writing 1251 * to the stream. 1252 */ 1253 if (res > 0) { 1254 last_sent_block = pss->block; 1255 } 1256 } 1257 1258 return res; 1259 } 1260 1261 /** 1262 * ram_save_host_page: Starting at *offset send pages up to the end 1263 * of the current host page. It's valid for the initial 1264 * offset to point into the middle of a host page 1265 * in which case the remainder of the hostpage is sent. 1266 * Only dirty target pages are sent. 1267 * 1268 * Returns: Number of pages written. 1269 * 1270 * @f: QEMUFile where to send the data 1271 * @block: pointer to block that contains the page we want to send 1272 * @offset: offset inside the block for the page; updated to last target page 1273 * sent 1274 * @last_stage: if we are at the completion stage 1275 * @bytes_transferred: increase it with the number of transferred bytes 1276 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space 1277 */ 1278 static int ram_save_host_page(MigrationState *ms, QEMUFile *f, 1279 PageSearchStatus *pss, 1280 bool last_stage, 1281 uint64_t *bytes_transferred, 1282 ram_addr_t dirty_ram_abs) 1283 { 1284 int tmppages, pages = 0; 1285 do { 1286 tmppages = ram_save_target_page(ms, f, pss, last_stage, 1287 bytes_transferred, dirty_ram_abs); 1288 if (tmppages < 0) { 1289 return tmppages; 1290 } 1291 1292 pages += tmppages; 1293 pss->offset += TARGET_PAGE_SIZE; 1294 dirty_ram_abs += TARGET_PAGE_SIZE; 1295 } while (pss->offset & (qemu_host_page_size - 1)); 1296 1297 /* The offset we leave with is the last one we looked at */ 1298 pss->offset -= TARGET_PAGE_SIZE; 1299 return pages; 1300 } 1301 1302 /** 1303 * ram_find_and_save_block: Finds a dirty page and sends it to f 1304 * 1305 * Called within an RCU critical section. 1306 * 1307 * Returns: The number of pages written 1308 * 0 means no dirty pages 1309 * 1310 * @f: QEMUFile where to send the data 1311 * @last_stage: if we are at the completion stage 1312 * @bytes_transferred: increase it with the number of transferred bytes 1313 * 1314 * On systems where host-page-size > target-page-size it will send all the 1315 * pages in a host page that are dirty. 
1316 */ 1317 1318 static int ram_find_and_save_block(QEMUFile *f, bool last_stage, 1319 uint64_t *bytes_transferred) 1320 { 1321 PageSearchStatus pss; 1322 MigrationState *ms = migrate_get_current(); 1323 int pages = 0; 1324 bool again, found; 1325 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in 1326 ram_addr_t space */ 1327 1328 pss.block = last_seen_block; 1329 pss.offset = last_offset; 1330 pss.complete_round = false; 1331 1332 if (!pss.block) { 1333 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1334 } 1335 1336 do { 1337 again = true; 1338 found = get_queued_page(ms, &pss, &dirty_ram_abs); 1339 1340 if (!found) { 1341 /* priority queue empty, so just search for something dirty */ 1342 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); 1343 } 1344 1345 if (found) { 1346 pages = ram_save_host_page(ms, f, &pss, 1347 last_stage, bytes_transferred, 1348 dirty_ram_abs); 1349 } 1350 } while (!pages && again); 1351 1352 last_seen_block = pss.block; 1353 last_offset = pss.offset; 1354 1355 return pages; 1356 } 1357 1358 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1359 { 1360 uint64_t pages = size / TARGET_PAGE_SIZE; 1361 if (zero) { 1362 acct_info.dup_pages += pages; 1363 } else { 1364 acct_info.norm_pages += pages; 1365 bytes_transferred += size; 1366 qemu_update_position(f, size); 1367 } 1368 } 1369 1370 static ram_addr_t ram_save_remaining(void) 1371 { 1372 return migration_dirty_pages; 1373 } 1374 1375 uint64_t ram_bytes_remaining(void) 1376 { 1377 return ram_save_remaining() * TARGET_PAGE_SIZE; 1378 } 1379 1380 uint64_t ram_bytes_transferred(void) 1381 { 1382 return bytes_transferred; 1383 } 1384 1385 uint64_t ram_bytes_total(void) 1386 { 1387 RAMBlock *block; 1388 uint64_t total = 0; 1389 1390 rcu_read_lock(); 1391 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) 1392 total += block->used_length; 1393 rcu_read_unlock(); 1394 return total; 1395 } 1396 1397 void free_xbzrle_decoded_buf(void) 1398 { 1399 g_free(xbzrle_decoded_buf); 1400 xbzrle_decoded_buf = NULL; 1401 } 1402 1403 static void migration_bitmap_free(struct BitmapRcu *bmap) 1404 { 1405 g_free(bmap->bmap); 1406 g_free(bmap->unsentmap); 1407 g_free(bmap); 1408 } 1409 1410 static void ram_migration_cleanup(void *opaque) 1411 { 1412 /* caller have hold iothread lock or is in a bh, so there is 1413 * no writing race against this migration_bitmap 1414 */ 1415 struct BitmapRcu *bitmap = migration_bitmap_rcu; 1416 atomic_rcu_set(&migration_bitmap_rcu, NULL); 1417 if (bitmap) { 1418 memory_global_dirty_log_stop(); 1419 call_rcu(bitmap, migration_bitmap_free, rcu); 1420 } 1421 1422 XBZRLE_cache_lock(); 1423 if (XBZRLE.cache) { 1424 cache_fini(XBZRLE.cache); 1425 g_free(XBZRLE.encoded_buf); 1426 g_free(XBZRLE.current_buf); 1427 g_free(ZERO_TARGET_PAGE); 1428 XBZRLE.cache = NULL; 1429 XBZRLE.encoded_buf = NULL; 1430 XBZRLE.current_buf = NULL; 1431 } 1432 XBZRLE_cache_unlock(); 1433 } 1434 1435 static void reset_ram_globals(void) 1436 { 1437 last_seen_block = NULL; 1438 last_sent_block = NULL; 1439 last_offset = 0; 1440 last_version = ram_list.version; 1441 ram_bulk_stage = true; 1442 } 1443 1444 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1445 1446 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) 1447 { 1448 /* called in qemu main thread, so there is 1449 * no writing race against this migration_bitmap 1450 */ 1451 if (migration_bitmap_rcu) { 1452 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap; 1453 bitmap = g_new(struct BitmapRcu, 1); 1454 bitmap->bmap 
= bitmap_new(new); 1455 1456 /* prevent migration_bitmap content from being set bit 1457 * by migration_bitmap_sync_range() at the same time. 1458 * it is safe to migration if migration_bitmap is cleared bit 1459 * at the same time. 1460 */ 1461 qemu_mutex_lock(&migration_bitmap_mutex); 1462 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old); 1463 bitmap_set(bitmap->bmap, old, new - old); 1464 1465 /* We don't have a way to safely extend the sentmap 1466 * with RCU; so mark it as missing, entry to postcopy 1467 * will fail. 1468 */ 1469 bitmap->unsentmap = NULL; 1470 1471 atomic_rcu_set(&migration_bitmap_rcu, bitmap); 1472 qemu_mutex_unlock(&migration_bitmap_mutex); 1473 migration_dirty_pages += new - old; 1474 call_rcu(old_bitmap, migration_bitmap_free, rcu); 1475 } 1476 } 1477 1478 /* 1479 * 'expected' is the value you expect the bitmap mostly to be full 1480 * of; it won't bother printing lines that are all this value. 1481 * If 'todump' is null the migration bitmap is dumped. 1482 */ 1483 void ram_debug_dump_bitmap(unsigned long *todump, bool expected) 1484 { 1485 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; 1486 1487 int64_t cur; 1488 int64_t linelen = 128; 1489 char linebuf[129]; 1490 1491 if (!todump) { 1492 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1493 } 1494 1495 for (cur = 0; cur < ram_pages; cur += linelen) { 1496 int64_t curb; 1497 bool found = false; 1498 /* 1499 * Last line; catch the case where the line length 1500 * is longer than remaining ram 1501 */ 1502 if (cur + linelen > ram_pages) { 1503 linelen = ram_pages - cur; 1504 } 1505 for (curb = 0; curb < linelen; curb++) { 1506 bool thisbit = test_bit(cur + curb, todump); 1507 linebuf[curb] = thisbit ? '1' : '.'; 1508 found = found || (thisbit != expected); 1509 } 1510 if (found) { 1511 linebuf[curb] = '\0'; 1512 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1513 } 1514 } 1515 } 1516 1517 /* **** functions for postcopy ***** */ 1518 1519 /* 1520 * Callback from postcopy_each_ram_send_discard for each RAMBlock 1521 * Note: At this point the 'unsentmap' is the processed bitmap combined 1522 * with the dirtymap; so a '1' means it's either dirty or unsent. 1523 * start,length: Indexes into the bitmap for the first bit 1524 * representing the named block and length in target-pages 1525 */ 1526 static int postcopy_send_discard_bm_ram(MigrationState *ms, 1527 PostcopyDiscardState *pds, 1528 unsigned long start, 1529 unsigned long length) 1530 { 1531 unsigned long end = start + length; /* one after the end */ 1532 unsigned long current; 1533 unsigned long *unsentmap; 1534 1535 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1536 for (current = start; current < end; ) { 1537 unsigned long one = find_next_bit(unsentmap, end, current); 1538 1539 if (one <= end) { 1540 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); 1541 unsigned long discard_length; 1542 1543 if (zero >= end) { 1544 discard_length = end - one; 1545 } else { 1546 discard_length = zero - one; 1547 } 1548 if (discard_length) { 1549 postcopy_discard_send_range(ms, pds, one, discard_length); 1550 } 1551 current = one + discard_length; 1552 } else { 1553 current = one; 1554 } 1555 } 1556 1557 return 0; 1558 } 1559 1560 /* 1561 * Utility for the outgoing postcopy code. 1562 * Calls postcopy_send_discard_bm_ram for each RAMBlock 1563 * passing it bitmap indexes and name. 
1564 * Returns: 0 on success 1565 * (qemu_ram_foreach_block ends up passing unscaled lengths 1566 * which would mean postcopy code would have to deal with target page) 1567 */ 1568 static int postcopy_each_ram_send_discard(MigrationState *ms) 1569 { 1570 struct RAMBlock *block; 1571 int ret; 1572 1573 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1574 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1575 PostcopyDiscardState *pds = postcopy_discard_send_init(ms, 1576 first, 1577 block->idstr); 1578 1579 /* 1580 * Postcopy sends chunks of bitmap over the wire, but it 1581 * just needs indexes at this point, avoids it having 1582 * target page specific code. 1583 */ 1584 ret = postcopy_send_discard_bm_ram(ms, pds, first, 1585 block->used_length >> TARGET_PAGE_BITS); 1586 postcopy_discard_send_finish(ms, pds); 1587 if (ret) { 1588 return ret; 1589 } 1590 } 1591 1592 return 0; 1593 } 1594 1595 /* 1596 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup 1597 * the two bitmaps, that are similar, but one is inverted. 1598 * 1599 * We search for runs of target-pages that don't start or end on a 1600 * host page boundary; 1601 * unsent_pass=true: Cleans up partially unsent host pages by searching 1602 * the unsentmap 1603 * unsent_pass=false: Cleans up partially dirty host pages by searching 1604 * the main migration bitmap 1605 * 1606 */ 1607 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, 1608 RAMBlock *block, 1609 PostcopyDiscardState *pds) 1610 { 1611 unsigned long *bitmap; 1612 unsigned long *unsentmap; 1613 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE; 1614 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1615 unsigned long len = block->used_length >> TARGET_PAGE_BITS; 1616 unsigned long last = first + (len - 1); 1617 unsigned long run_start; 1618 1619 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1620 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1621 1622 if (unsent_pass) { 1623 /* Find a sent page */ 1624 run_start = find_next_zero_bit(unsentmap, last + 1, first); 1625 } else { 1626 /* Find a dirty page */ 1627 run_start = find_next_bit(bitmap, last + 1, first); 1628 } 1629 1630 while (run_start <= last) { 1631 bool do_fixup = false; 1632 unsigned long fixup_start_addr; 1633 unsigned long host_offset; 1634 1635 /* 1636 * If the start of this run of pages is in the middle of a host 1637 * page, then we need to fixup this host page. 1638 */ 1639 host_offset = run_start % host_ratio; 1640 if (host_offset) { 1641 do_fixup = true; 1642 run_start -= host_offset; 1643 fixup_start_addr = run_start; 1644 /* For the next pass */ 1645 run_start = run_start + host_ratio; 1646 } else { 1647 /* Find the end of this run */ 1648 unsigned long run_end; 1649 if (unsent_pass) { 1650 run_end = find_next_bit(unsentmap, last + 1, run_start + 1); 1651 } else { 1652 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); 1653 } 1654 /* 1655 * If the end isn't at the start of a host page, then the 1656 * run doesn't finish at the end of a host page 1657 * and we need to discard. 
1658 */ 1659 host_offset = run_end % host_ratio; 1660 if (host_offset) { 1661 do_fixup = true; 1662 fixup_start_addr = run_end - host_offset; 1663 /* 1664 * This host page has gone, the next loop iteration starts 1665 * from after the fixup 1666 */ 1667 run_start = fixup_start_addr + host_ratio; 1668 } else { 1669 /* 1670 * No discards on this iteration, next loop starts from 1671 * next sent/dirty page 1672 */ 1673 run_start = run_end + 1; 1674 } 1675 } 1676 1677 if (do_fixup) { 1678 unsigned long page; 1679 1680 /* Tell the destination to discard this page */ 1681 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { 1682 /* For the unsent_pass we: 1683 * discard partially sent pages 1684 * For the !unsent_pass (dirty) we: 1685 * discard partially dirty pages that were sent 1686 * (any partially sent pages were already discarded 1687 * by the previous unsent_pass) 1688 */ 1689 postcopy_discard_send_range(ms, pds, fixup_start_addr, 1690 host_ratio); 1691 } 1692 1693 /* Clean up the bitmap */ 1694 for (page = fixup_start_addr; 1695 page < fixup_start_addr + host_ratio; page++) { 1696 /* All pages in this host page are now not sent */ 1697 set_bit(page, unsentmap); 1698 1699 /* 1700 * Remark them as dirty, updating the count for any pages 1701 * that weren't previously dirty. 1702 */ 1703 migration_dirty_pages += !test_and_set_bit(page, bitmap); 1704 } 1705 } 1706 1707 if (unsent_pass) { 1708 /* Find the next sent page for the next iteration */ 1709 run_start = find_next_zero_bit(unsentmap, last + 1, 1710 run_start); 1711 } else { 1712 /* Find the next dirty page for the next iteration */ 1713 run_start = find_next_bit(bitmap, last + 1, run_start); 1714 } 1715 } 1716 } 1717 1718 /* 1719 * Utility for the outgoing postcopy code. 1720 * 1721 * Discard any partially sent host-page size chunks, mark any partially 1722 * dirty host-page size chunks as all dirty. 1723 * 1724 * Returns: 0 on success 1725 */ 1726 static int postcopy_chunk_hostpages(MigrationState *ms) 1727 { 1728 struct RAMBlock *block; 1729 1730 if (qemu_host_page_size == TARGET_PAGE_SIZE) { 1731 /* Easy case - TPS==HPS - nothing to be done */ 1732 return 0; 1733 } 1734 1735 /* Easiest way to make sure we don't resume in the middle of a host-page */ 1736 last_seen_block = NULL; 1737 last_sent_block = NULL; 1738 last_offset = 0; 1739 1740 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1741 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1742 1743 PostcopyDiscardState *pds = 1744 postcopy_discard_send_init(ms, first, block->idstr); 1745 1746 /* First pass: Discard all partially sent host pages */ 1747 postcopy_chunk_hostpages_pass(ms, true, block, pds); 1748 /* 1749 * Second pass: Ensure that all partially dirty host pages are made 1750 * fully dirty. 
1751 */ 1752 postcopy_chunk_hostpages_pass(ms, false, block, pds); 1753 1754 postcopy_discard_send_finish(ms, pds); 1755 } /* ram_list loop */ 1756 1757 return 0; 1758 } 1759 1760 /* 1761 * Transmit the set of pages to be discarded after precopy to the target 1762 * these are pages that: 1763 * a) Have been previously transmitted but are now dirty again 1764 * b) Pages that have never been transmitted, this ensures that 1765 * any pages on the destination that have been mapped by background 1766 * tasks get discarded (transparent huge pages is the specific concern) 1767 * Hopefully this is pretty sparse 1768 */ 1769 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 1770 { 1771 int ret; 1772 unsigned long *bitmap, *unsentmap; 1773 1774 rcu_read_lock(); 1775 1776 /* This should be our last sync, the src is now paused */ 1777 migration_bitmap_sync(); 1778 1779 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1780 if (!unsentmap) { 1781 /* We don't have a safe way to resize the sentmap, so 1782 * if the bitmap was resized it will be NULL at this 1783 * point. 1784 */ 1785 error_report("migration ram resized during precopy phase"); 1786 rcu_read_unlock(); 1787 return -EINVAL; 1788 } 1789 1790 /* Deal with TPS != HPS */ 1791 ret = postcopy_chunk_hostpages(ms); 1792 if (ret) { 1793 rcu_read_unlock(); 1794 return ret; 1795 } 1796 1797 /* 1798 * Update the unsentmap to be unsentmap = unsentmap | dirty 1799 */ 1800 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1801 bitmap_or(unsentmap, unsentmap, bitmap, 1802 last_ram_offset() >> TARGET_PAGE_BITS); 1803 1804 1805 trace_ram_postcopy_send_discard_bitmap(); 1806 #ifdef DEBUG_POSTCOPY 1807 ram_debug_dump_bitmap(unsentmap, true); 1808 #endif 1809 1810 ret = postcopy_each_ram_send_discard(ms); 1811 rcu_read_unlock(); 1812 1813 return ret; 1814 } 1815 1816 /* 1817 * At the start of the postcopy phase of migration, any now-dirty 1818 * precopied pages are discarded. 1819 * 1820 * start, length describe a byte address range within the RAMBlock 1821 * 1822 * Returns 0 on success. 
1823 */ 1824 int ram_discard_range(MigrationIncomingState *mis, 1825 const char *block_name, 1826 uint64_t start, size_t length) 1827 { 1828 int ret = -1; 1829 1830 rcu_read_lock(); 1831 RAMBlock *rb = qemu_ram_block_by_name(block_name); 1832 1833 if (!rb) { 1834 error_report("ram_discard_range: Failed to find block '%s'", 1835 block_name); 1836 goto err; 1837 } 1838 1839 uint8_t *host_startaddr = rb->host + start; 1840 1841 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) { 1842 error_report("ram_discard_range: Unaligned start address: %p", 1843 host_startaddr); 1844 goto err; 1845 } 1846 1847 if ((start + length) <= rb->used_length) { 1848 uint8_t *host_endaddr = host_startaddr + length; 1849 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) { 1850 error_report("ram_discard_range: Unaligned end address: %p", 1851 host_endaddr); 1852 goto err; 1853 } 1854 ret = postcopy_ram_discard_range(mis, host_startaddr, length); 1855 } else { 1856 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 1857 "/%zx/" RAM_ADDR_FMT")", 1858 block_name, start, length, rb->used_length); 1859 } 1860 1861 err: 1862 rcu_read_unlock(); 1863 1864 return ret; 1865 } 1866 1867 static int ram_save_init_globals(void) 1868 { 1869 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ 1870 1871 dirty_rate_high_cnt = 0; 1872 bitmap_sync_count = 0; 1873 migration_bitmap_sync_init(); 1874 qemu_mutex_init(&migration_bitmap_mutex); 1875 1876 if (migrate_use_xbzrle()) { 1877 XBZRLE_cache_lock(); 1878 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE); 1879 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / 1880 TARGET_PAGE_SIZE, 1881 TARGET_PAGE_SIZE); 1882 if (!XBZRLE.cache) { 1883 XBZRLE_cache_unlock(); 1884 error_report("Error creating cache"); 1885 return -1; 1886 } 1887 XBZRLE_cache_unlock(); 1888 1889 /* We prefer not to abort if there is no memory */ 1890 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 1891 if (!XBZRLE.encoded_buf) { 1892 error_report("Error allocating encoded_buf"); 1893 return -1; 1894 } 1895 1896 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 1897 if (!XBZRLE.current_buf) { 1898 error_report("Error allocating current_buf"); 1899 g_free(XBZRLE.encoded_buf); 1900 XBZRLE.encoded_buf = NULL; 1901 return -1; 1902 } 1903 1904 acct_clear(); 1905 } 1906 1907 /* For memory_global_dirty_log_start below. */ 1908 qemu_mutex_lock_iothread(); 1909 1910 qemu_mutex_lock_ramlist(); 1911 rcu_read_lock(); 1912 bytes_transferred = 0; 1913 reset_ram_globals(); 1914 1915 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; 1916 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); 1917 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); 1918 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); 1919 1920 if (migrate_postcopy_ram()) { 1921 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); 1922 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); 1923 } 1924 1925 /* 1926 * Count the total number of pages used by ram blocks not including any 1927 * gaps due to alignment or unplugs. 1928 */ 1929 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 1930 1931 memory_global_dirty_log_start(); 1932 migration_bitmap_sync(); 1933 qemu_mutex_unlock_ramlist(); 1934 qemu_mutex_unlock_iothread(); 1935 rcu_read_unlock(); 1936 1937 return 0; 1938 } 1939 1940 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has 1941 * long-running RCU critical section. 
When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */

static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_save_init_globals() < 0) {
            return -1;
        }
    }

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        acct_info.iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check once
           every few iterations.
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
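     * (The EOS marker written below is a single be64 flags word with
     * RAM_SAVE_FLAG_EOS set, which is why 8 bytes are added to
     * bytes_transferred right after it.)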
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}

/* Called with the iothread lock held */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    if (!migration_in_postcopy(migrate_get_current())) {
        migration_bitmap_sync();
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
                                        &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy(migrate_get_current()) &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/* Must be called from within an RCU critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
/*
 * Read a RAMBlock ID from the stream f.
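 *
 * (Sketch of what the reader below expects: when RAM_SAVE_FLAG_CONTINUE is
 * not set in the page flags, a one-byte idstr length is followed by that
 * many bytes of RAMBlock idstr; when it is set, the block returned by the
 * previous call is reused.)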
 *
 * f: Stream to read from
 * flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
                                              int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while it was being compressed; that's not
             * a problem because the dirty page will be retransferred
             * and uncompress() won't break the data in other pages.
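             * (This is also why the return value of uncompress() is
             * deliberately not checked here.)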
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/*
 * Allocate data structures etc. needed by incoming migration with
 * postcopy-ram; postcopy-ram's similarly named postcopy_ram_incoming_init
 * does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    return
postcopy_ram_incoming_init(mis, ram_pages); 2332 } 2333 2334 /* 2335 * Called in postcopy mode by ram_load(). 2336 * rcu_read_lock is taken prior to this being called. 2337 */ 2338 static int ram_load_postcopy(QEMUFile *f) 2339 { 2340 int flags = 0, ret = 0; 2341 bool place_needed = false; 2342 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE; 2343 MigrationIncomingState *mis = migration_incoming_get_current(); 2344 /* Temporary page that is later 'placed' */ 2345 void *postcopy_host_page = postcopy_get_tmp_page(mis); 2346 void *last_host = NULL; 2347 bool all_zero = false; 2348 2349 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 2350 ram_addr_t addr; 2351 void *host = NULL; 2352 void *page_buffer = NULL; 2353 void *place_source = NULL; 2354 uint8_t ch; 2355 2356 addr = qemu_get_be64(f); 2357 flags = addr & ~TARGET_PAGE_MASK; 2358 addr &= TARGET_PAGE_MASK; 2359 2360 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 2361 place_needed = false; 2362 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) { 2363 RAMBlock *block = ram_block_from_stream(f, flags); 2364 2365 host = host_from_ram_block_offset(block, addr); 2366 if (!host) { 2367 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 2368 ret = -EINVAL; 2369 break; 2370 } 2371 /* 2372 * Postcopy requires that we place whole host pages atomically. 2373 * To make it atomic, the data is read into a temporary page 2374 * that's moved into place later. 2375 * The migration protocol uses, possibly smaller, target-pages 2376 * however the source ensures it always sends all the components 2377 * of a host page in order. 2378 */ 2379 page_buffer = postcopy_host_page + 2380 ((uintptr_t)host & ~qemu_host_page_mask); 2381 /* If all TP are zero then we can optimise the place */ 2382 if (!((uintptr_t)host & ~qemu_host_page_mask)) { 2383 all_zero = true; 2384 } else { 2385 /* not the 1st TP within the HP */ 2386 if (host != (last_host + TARGET_PAGE_SIZE)) { 2387 error_report("Non-sequential target page %p/%p", 2388 host, last_host); 2389 ret = -EINVAL; 2390 break; 2391 } 2392 } 2393 2394 2395 /* 2396 * If it's the last part of a host page then we place the host 2397 * page 2398 */ 2399 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & 2400 ~qemu_host_page_mask) == 0; 2401 place_source = postcopy_host_page; 2402 } 2403 last_host = host; 2404 2405 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2406 case RAM_SAVE_FLAG_COMPRESS: 2407 ch = qemu_get_byte(f); 2408 memset(page_buffer, ch, TARGET_PAGE_SIZE); 2409 if (ch) { 2410 all_zero = false; 2411 } 2412 break; 2413 2414 case RAM_SAVE_FLAG_PAGE: 2415 all_zero = false; 2416 if (!place_needed || !matching_page_sizes) { 2417 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 2418 } else { 2419 /* Avoids the qemu_file copy during postcopy, which is 2420 * going to do a copy later; can only do it when we 2421 * do this read in one go (matching page sizes) 2422 */ 2423 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 2424 TARGET_PAGE_SIZE); 2425 } 2426 break; 2427 case RAM_SAVE_FLAG_EOS: 2428 /* normal exit */ 2429 break; 2430 default: 2431 error_report("Unknown combination of migration flags: %#x" 2432 " (postcopy mode)", flags); 2433 ret = -EINVAL; 2434 } 2435 2436 if (place_needed) { 2437 /* This gets called at the last target page in the host page */ 2438 if (all_zero) { 2439 ret = postcopy_place_page_zero(mis, 2440 host + TARGET_PAGE_SIZE - 2441 qemu_host_page_size); 2442 } else { 2443 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE - 2444 qemu_host_page_size, 2445 
place_source); 2446 } 2447 } 2448 if (!ret) { 2449 ret = qemu_file_get_error(f); 2450 } 2451 } 2452 2453 return ret; 2454 } 2455 2456 static int ram_load(QEMUFile *f, void *opaque, int version_id) 2457 { 2458 int flags = 0, ret = 0; 2459 static uint64_t seq_iter; 2460 int len = 0; 2461 /* 2462 * If system is running in postcopy mode, page inserts to host memory must 2463 * be atomic 2464 */ 2465 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; 2466 2467 seq_iter++; 2468 2469 if (version_id != 4) { 2470 ret = -EINVAL; 2471 } 2472 2473 /* This RCU critical section can be very long running. 2474 * When RCU reclaims in the code start to become numerous, 2475 * it will be necessary to reduce the granularity of this 2476 * critical section. 2477 */ 2478 rcu_read_lock(); 2479 2480 if (postcopy_running) { 2481 ret = ram_load_postcopy(f); 2482 } 2483 2484 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { 2485 ram_addr_t addr, total_ram_bytes; 2486 void *host = NULL; 2487 uint8_t ch; 2488 2489 addr = qemu_get_be64(f); 2490 flags = addr & ~TARGET_PAGE_MASK; 2491 addr &= TARGET_PAGE_MASK; 2492 2493 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE | 2494 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 2495 RAMBlock *block = ram_block_from_stream(f, flags); 2496 2497 host = host_from_ram_block_offset(block, addr); 2498 if (!host) { 2499 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 2500 ret = -EINVAL; 2501 break; 2502 } 2503 } 2504 2505 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 2506 case RAM_SAVE_FLAG_MEM_SIZE: 2507 /* Synchronize RAM block list */ 2508 total_ram_bytes = addr; 2509 while (!ret && total_ram_bytes) { 2510 RAMBlock *block; 2511 char id[256]; 2512 ram_addr_t length; 2513 2514 len = qemu_get_byte(f); 2515 qemu_get_buffer(f, (uint8_t *)id, len); 2516 id[len] = 0; 2517 length = qemu_get_be64(f); 2518 2519 block = qemu_ram_block_by_name(id); 2520 if (block) { 2521 if (length != block->used_length) { 2522 Error *local_err = NULL; 2523 2524 ret = qemu_ram_resize(block, length, 2525 &local_err); 2526 if (local_err) { 2527 error_report_err(local_err); 2528 } 2529 } 2530 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 2531 block->idstr); 2532 } else { 2533 error_report("Unknown ramblock \"%s\", cannot " 2534 "accept migration", id); 2535 ret = -EINVAL; 2536 } 2537 2538 total_ram_bytes -= length; 2539 } 2540 break; 2541 2542 case RAM_SAVE_FLAG_COMPRESS: 2543 ch = qemu_get_byte(f); 2544 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 2545 break; 2546 2547 case RAM_SAVE_FLAG_PAGE: 2548 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 2549 break; 2550 2551 case RAM_SAVE_FLAG_COMPRESS_PAGE: 2552 len = qemu_get_be32(f); 2553 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 2554 error_report("Invalid compressed data length: %d", len); 2555 ret = -EINVAL; 2556 break; 2557 } 2558 decompress_data_with_multi_threads(f, host, len); 2559 break; 2560 2561 case RAM_SAVE_FLAG_XBZRLE: 2562 if (load_xbzrle(f, addr, host) < 0) { 2563 error_report("Failed to decompress XBZRLE page at " 2564 RAM_ADDR_FMT, addr); 2565 ret = -EINVAL; 2566 break; 2567 } 2568 break; 2569 case RAM_SAVE_FLAG_EOS: 2570 /* normal exit */ 2571 break; 2572 default: 2573 if (flags & RAM_SAVE_FLAG_HOOK) { 2574 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 2575 } else { 2576 error_report("Unknown combination of migration flags: %#x", 2577 flags); 2578 ret = -EINVAL; 2579 } 2580 } 2581 if (!ret) { 2582 ret = qemu_file_get_error(f); 2583 } 2584 } 2585 2586 
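    /* Make sure any decompression work still in flight has finished writing
     * into guest RAM before we drop the RCU read lock and return. */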
wait_for_decompress_done(); 2587 rcu_read_unlock(); 2588 trace_ram_load_complete(ret, seq_iter); 2589 return ret; 2590 } 2591 2592 static SaveVMHandlers savevm_ram_handlers = { 2593 .save_live_setup = ram_save_setup, 2594 .save_live_iterate = ram_save_iterate, 2595 .save_live_complete_postcopy = ram_save_complete, 2596 .save_live_complete_precopy = ram_save_complete, 2597 .save_live_pending = ram_save_pending, 2598 .load_state = ram_load, 2599 .cleanup = ram_migration_cleanup, 2600 }; 2601 2602 void ram_mig_init(void) 2603 { 2604 qemu_mutex_init(&XBZRLE.lock); 2605 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); 2606 } 2607
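
/*
 * Rough sketch of the stream produced by the handlers above (a summary of
 * the code, not a normative format description):
 *
 *   ram_save_setup():    a be64 of the total RAM size with
 *                        RAM_SAVE_FLAG_MEM_SIZE set, then one record per
 *                        RAMBlock (u8 idstr length, idstr bytes, be64
 *                        used_length), then a be64 RAM_SAVE_FLAG_EOS word.
 *   ram_save_iterate():  pages emitted by ram_find_and_save_block() (see the
 *                        per-page flags handled in ram_load()), then
 *                        RAM_SAVE_FLAG_EOS.
 *   ram_save_complete(): all remaining pages, then RAM_SAVE_FLAG_EOS.
 */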