/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we renamed it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns the new_size or negative in case of error.
 *
 * @new_size: new cache size
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH(block) {
        summary |= block->page_size;
    }

    return summary;
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine.
               If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        ram_counters.duplicate++;
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(rs->f, 0);
        ram_counters.transferred += 1;
        pages = 1;
    }

    return pages;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* If in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        pages = save_zero_page(rs, block, offset, p);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(block->idstr, offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
        }
        ram_counters.transferred += TARGET_PAGE_SIZE;
        pages = 1;
        ram_counters.normal++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    RAMState *rs = ram_state;
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(rs, f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
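        /* A thread that has been asked to quit is being torn down;
         * don't flush its buffered output.
         */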
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            ram_counters.transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in the last block should have been
         * sent out; keeping this order is important, because the 'cont'
         * flag is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns whether a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs);
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns whether a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    int res = 0;

    /* Check if the page is dirty and if so, send it */
    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
        /*
         * If xbzrle is on, stop using the data compression after first
         * round of migration even if compression is enabled. In theory,
         * xbzrle can do better than compression.
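         * (After the bulk stage this check therefore routes pages through
         * ram_save_page() whenever XBZRLE is enabled.)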
         */
        if (migrate_use_compression() &&
            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
            res = ram_save_compressed_page(rs, pss, last_stage);
        } else {
            res = ram_save_page(rs, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }
    }

    return res;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    do {
        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
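 * (The search resumes from rs->last_seen_block / rs->last_page, i.e. where
 * the previous call left off.)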
1290 */ 1291 1292 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1293 { 1294 PageSearchStatus pss; 1295 int pages = 0; 1296 bool again, found; 1297 1298 /* No dirty page as there is zero RAM */ 1299 if (!ram_bytes_total()) { 1300 return pages; 1301 } 1302 1303 pss.block = rs->last_seen_block; 1304 pss.page = rs->last_page; 1305 pss.complete_round = false; 1306 1307 if (!pss.block) { 1308 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1309 } 1310 1311 do { 1312 again = true; 1313 found = get_queued_page(rs, &pss); 1314 1315 if (!found) { 1316 /* priority queue empty, so just search for something dirty */ 1317 found = find_dirty_block(rs, &pss, &again); 1318 } 1319 1320 if (found) { 1321 pages = ram_save_host_page(rs, &pss, last_stage); 1322 } 1323 } while (!pages && again); 1324 1325 rs->last_seen_block = pss.block; 1326 rs->last_page = pss.page; 1327 1328 return pages; 1329 } 1330 1331 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1332 { 1333 uint64_t pages = size / TARGET_PAGE_SIZE; 1334 1335 if (zero) { 1336 ram_counters.duplicate += pages; 1337 } else { 1338 ram_counters.normal += pages; 1339 ram_counters.transferred += size; 1340 qemu_update_position(f, size); 1341 } 1342 } 1343 1344 uint64_t ram_bytes_total(void) 1345 { 1346 RAMBlock *block; 1347 uint64_t total = 0; 1348 1349 rcu_read_lock(); 1350 RAMBLOCK_FOREACH(block) { 1351 total += block->used_length; 1352 } 1353 rcu_read_unlock(); 1354 return total; 1355 } 1356 1357 void free_xbzrle_decoded_buf(void) 1358 { 1359 g_free(xbzrle_decoded_buf); 1360 xbzrle_decoded_buf = NULL; 1361 } 1362 1363 static void ram_migration_cleanup(void *opaque) 1364 { 1365 RAMState **rsp = opaque; 1366 RAMBlock *block; 1367 1368 /* caller have hold iothread lock or is in a bh, so there is 1369 * no writing race against this migration_bitmap 1370 */ 1371 memory_global_dirty_log_stop(); 1372 1373 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1374 g_free(block->bmap); 1375 block->bmap = NULL; 1376 g_free(block->unsentmap); 1377 block->unsentmap = NULL; 1378 } 1379 1380 XBZRLE_cache_lock(); 1381 if (XBZRLE.cache) { 1382 cache_fini(XBZRLE.cache); 1383 g_free(XBZRLE.encoded_buf); 1384 g_free(XBZRLE.current_buf); 1385 g_free(XBZRLE.zero_target_page); 1386 XBZRLE.cache = NULL; 1387 XBZRLE.encoded_buf = NULL; 1388 XBZRLE.current_buf = NULL; 1389 XBZRLE.zero_target_page = NULL; 1390 } 1391 XBZRLE_cache_unlock(); 1392 migration_page_queue_free(*rsp); 1393 g_free(*rsp); 1394 *rsp = NULL; 1395 } 1396 1397 static void ram_state_reset(RAMState *rs) 1398 { 1399 rs->last_seen_block = NULL; 1400 rs->last_sent_block = NULL; 1401 rs->last_page = 0; 1402 rs->last_version = ram_list.version; 1403 rs->ram_bulk_stage = true; 1404 } 1405 1406 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1407 1408 /* 1409 * 'expected' is the value you expect the bitmap mostly to be full 1410 * of; it won't bother printing lines that are all this value. 1411 * If 'todump' is null the migration bitmap is dumped. 
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *unsentmap = block->unsentmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH(block) {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}

/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
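     * (Postcopy places whole host pages on the destination, so a host
     * page must be resent either in full or not at all.)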
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}

static int ram_state_init(RAMState **rsp)
{
    *rsp = g_new0(RAMState, 1);

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            g_free(*rsp);
            *rsp = NULL;
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            g_free(*rsp);
            *rsp = NULL;
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            g_free(*rsp);
            *rsp = NULL;
            return -1;
        }
    }

    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    ram_state_reset(*rsp);

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        RAMBlock *block;

        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;

            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync(*rsp);
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}

/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* In COLO state, migration has already set up the bitmap; reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_state_init(rsp) != 0) {
            return -1;
        }
    }
    (*rsp)->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check every few
           iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
/**
 * ram_save_complete: function called to send the remaining amount of RAM
 *
 * Returns zero to indicate success
 *
 * Called with the iothread lock held
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
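
/*
 * Illustrative sketch (derived only from load_xbzrle() above) of a
 * RAM_SAVE_FLAG_XBZRLE payload as it appears on the wire:
 *
 *   byte:          xh_flags  -- must be ENCODING_FLAG_XBZRLE
 *   be16:          xh_len    -- length of the encoded data, <= TARGET_PAGE_SIZE
 *   bytes[xh_len]:           -- XBZRLE-encoded delta, decoded into the target
 *                               page by xbzrle_decode_buffer()
 *
 * The encoding is a delta against the page's previous contents, so an XBZRLE
 * record can only follow an earlier full copy of the same page.
 */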
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within an RCU critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
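
/*
 * Note on ram_handle_compressed() above: the is_zero_range() check means an
 * incoming zero page only triggers a memset() when the destination page is
 * not already zero.  Skipping the write avoids needlessly dirtying (and, for
 * anonymous or file-backed memory, populating) pages that the host kernel
 * could otherwise leave shared or unallocated.
 */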
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is
             * not a problem, because the dirty page will be retransmitted
             * and uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
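
/*
 * Summary of the handshake between decompress_data_with_multi_threads()
 * and do_data_decompress(), as implemented above:
 *
 *   main (load) thread                      decompress thread idx
 *   ------------------                      ---------------------
 *   find idx with done == true
 *   done = false
 *   lock param->mutex
 *   read compressed data into compbuf,
 *   set des/len
 *   signal param->cond  ------------------>  wakes, copies des/len,
 *   unlock param->mutex                      unlocks, runs uncompress()
 *   (if no idle thread: wait on              lock decomp_done_lock
 *    decomp_done_cond)  <------------------  done = true,
 *                                            signal decomp_done_cond
 *
 * wait_for_decompress_done() simply waits until every done flag is true
 * again, i.e. until all outstanding pages have been decompressed.
 */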
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses (possibly smaller) target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
             */
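            /*
             * Worked example (illustrative numbers only): with 2 MiB
             * hugetlbfs host pages and 4 KiB target pages, 512 target
             * pages are accumulated into postcopy_host_page; only when
             * the 512th (last) one arrives is place_needed set and the
             * whole 2 MiB page placed in one atomic operation.
             */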
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /*
             * If all target pages of a host page turn out to be zero, the
             * placement can be optimised; start assuming that at the first
             * target page of the host page.
             */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the first target page within the host page */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                 host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                            (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids an extra copy out of the QEMUFile buffer, since
                 * the data is going to be copied into place later anyway;
                 * this can only be done when the whole read happens in one
                 * go (matching page sizes).
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block->page_size);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block->page_size);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
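
/*
 * Illustrative sketch (not authoritative) of a single record in the precopy
 * stream, as parsed by ram_load() below:
 *
 *   be64: addr | flags        -- bits below TARGET_PAGE_MASK are the flags
 *   [byte len + idstr]        -- RAMBlock id, unless RAM_SAVE_FLAG_CONTINUE
 *                                says "same block as the previous page"
 *   then, depending on the flag:
 *     RAM_SAVE_FLAG_MEM_SIZE:      the RAMBlock list written by ram_save_setup()
 *     RAM_SAVE_FLAG_ZERO:          byte fill value (only 0 is supported)
 *     RAM_SAVE_FLAG_PAGE:          TARGET_PAGE_SIZE raw bytes
 *     RAM_SAVE_FLAG_COMPRESS_PAGE: be32 length + zlib-compressed page data
 *     RAM_SAVE_FLAG_XBZRLE:        XBZRLE header + encoded data
 *     RAM_SAVE_FLAG_EOS:           no payload, ends the section
 */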
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic
     */
    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
    /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
    bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check that hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}
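
/*
 * Note: the "version_id != 4" check in ram_load() above pairs with the
 * version number passed to register_savevm_live() in ram_mig_init() below;
 * in the normal (precopy) path any other stream version is rejected with
 * -EINVAL.
 */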
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}