/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "socket.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "qemu/uuid.h"
#include "savevm.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that the bitmap can be parsed even when the source
     * and destination VMs do not share the same endianness. (Note:
     * big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = nbits / 8;

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
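
/*
 * For illustration only: a minimal sketch of what a reader of the
 * stream written above has to do to recover the bitmap.  The real
 * loading path lives elsewhere in the migration code; the function
 * name and the error handling below are made up for this example.
 */
static int G_GNUC_UNUSED example_load_recv_bitmap(QEMUFile *file,
                                                  RAMBlock *block)
{
    unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
    /* The sender rounds the byte size up to 8, so mirror that here */
    uint64_t expected = ROUND_UP(nbits / 8, 8);
    unsigned long *le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
    uint64_t size, end_mark;

    size = qemu_get_be64(file);
    if (size != expected) {
        error_report("%s: bitmap size mismatch: %" PRIu64 " != %" PRIu64,
                     __func__, size, expected);
        g_free(le_bitmap);
        return -1;
    }

    qemu_get_buffer(file, (uint8_t *)le_bitmap, size);

    end_mark = qemu_get_be64(file);
    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: missing ending marker for block '%s'",
                     __func__, block->idstr);
        g_free(le_bitmap);
        return -1;
    }

    /* Convert back from the little-endian wire representation */
    bitmap_from_le(block->receivedmap, le_bitmap, nbits);
    g_free(le_bitmap);
    return 0;
}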

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, &param->stream, block, offset,
                                 param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }
        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
qemu_fclose(comp_param[i].file); 448 comp_param[i].file = NULL; 449 } 450 qemu_mutex_destroy(&comp_done_lock); 451 qemu_cond_destroy(&comp_done_cond); 452 g_free(compress_threads); 453 g_free(comp_param); 454 compress_threads = NULL; 455 comp_param = NULL; 456 } 457 458 static int compress_threads_save_setup(void) 459 { 460 int i, thread_count; 461 462 if (!migrate_use_compression()) { 463 return 0; 464 } 465 thread_count = migrate_compress_threads(); 466 compress_threads = g_new0(QemuThread, thread_count); 467 comp_param = g_new0(CompressParam, thread_count); 468 qemu_cond_init(&comp_done_cond); 469 qemu_mutex_init(&comp_done_lock); 470 for (i = 0; i < thread_count; i++) { 471 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); 472 if (!comp_param[i].originbuf) { 473 goto exit; 474 } 475 476 if (deflateInit(&comp_param[i].stream, 477 migrate_compress_level()) != Z_OK) { 478 g_free(comp_param[i].originbuf); 479 goto exit; 480 } 481 482 /* comp_param[i].file is just used as a dummy buffer to save data, 483 * set its ops to empty. 484 */ 485 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops); 486 comp_param[i].done = true; 487 comp_param[i].quit = false; 488 qemu_mutex_init(&comp_param[i].mutex); 489 qemu_cond_init(&comp_param[i].cond); 490 qemu_thread_create(compress_threads + i, "compress", 491 do_data_compress, comp_param + i, 492 QEMU_THREAD_JOINABLE); 493 } 494 return 0; 495 496 exit: 497 compress_threads_save_cleanup(); 498 return -1; 499 } 500 501 /* Multiple fd's */ 502 503 #define MULTIFD_MAGIC 0x11223344U 504 #define MULTIFD_VERSION 1 505 506 typedef struct { 507 uint32_t magic; 508 uint32_t version; 509 unsigned char uuid[16]; /* QemuUUID */ 510 uint8_t id; 511 } __attribute__((packed)) MultiFDInit_t; 512 513 typedef struct { 514 /* this fields are not changed once the thread is created */ 515 /* channel number */ 516 uint8_t id; 517 /* channel thread name */ 518 char *name; 519 /* channel thread id */ 520 QemuThread thread; 521 /* communication channel */ 522 QIOChannel *c; 523 /* sem where to wait for more work */ 524 QemuSemaphore sem; 525 /* this mutex protects the following parameters */ 526 QemuMutex mutex; 527 /* is this channel thread running */ 528 bool running; 529 /* should this thread finish */ 530 bool quit; 531 } MultiFDSendParams; 532 533 typedef struct { 534 /* this fields are not changed once the thread is created */ 535 /* channel number */ 536 uint8_t id; 537 /* channel thread name */ 538 char *name; 539 /* channel thread id */ 540 QemuThread thread; 541 /* communication channel */ 542 QIOChannel *c; 543 /* sem where to wait for more work */ 544 QemuSemaphore sem; 545 /* this mutex protects the following parameters */ 546 QemuMutex mutex; 547 /* is this channel thread running */ 548 bool running; 549 /* should this thread finish */ 550 bool quit; 551 } MultiFDRecvParams; 552 553 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) 554 { 555 MultiFDInit_t msg; 556 int ret; 557 558 msg.magic = cpu_to_be32(MULTIFD_MAGIC); 559 msg.version = cpu_to_be32(MULTIFD_VERSION); 560 msg.id = p->id; 561 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid)); 562 563 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp); 564 if (ret != 0) { 565 return -1; 566 } 567 return 0; 568 } 569 570 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) 571 { 572 MultiFDInit_t msg; 573 int ret; 574 575 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp); 576 if (ret != 0) { 577 return -1; 578 } 579 580 
    be32_to_cpus(&msg.magic);
    be32_to_cpus(&msg.version);

    if (msg.magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x "
                   "expected %x", msg.magic, MULTIFD_MAGIC);
        return -1;
    }

    if (msg.version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet version %d "
                   "expected %d", msg.version, MULTIFD_VERSION);
        return -1;
    }

    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);

        error_setg(errp, "multifd: received uuid '%s' and expected "
                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
        g_free(uuid);
        g_free(msg_uuid);
        return -1;
    }

    if (msg.id > migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel id %d is greater than "
                   "number of channels %d", msg.id, migrate_multifd_channels());
        return -1;
    }

    return msg.id;
}

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;

static void multifd_send_terminate_threads(Error *err)
{
    int i;

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
            s->state == MIGRATION_STATUS_DEVICE ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_save_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    multifd_send_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        socket_send_channel_destroy(p->c);
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
    return ret;
}

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

out:
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    return NULL;
}

static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
{
    MultiFDSendParams *p = opaque;
    QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
    Error *local_err = NULL;

    if (qio_task_propagate_error(task, &local_err)) {
        if (multifd_save_cleanup(&local_err) != 0) {
            migrate_set_error(migrate_get_current(), local_err);
        }
    } else {
        p->c = QIO_CHANNEL(sioc);
qio_channel_set_delay(p->c, false); 720 p->running = true; 721 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, 722 QEMU_THREAD_JOINABLE); 723 724 atomic_inc(&multifd_send_state->count); 725 } 726 } 727 728 int multifd_save_setup(void) 729 { 730 int thread_count; 731 uint8_t i; 732 733 if (!migrate_use_multifd()) { 734 return 0; 735 } 736 thread_count = migrate_multifd_channels(); 737 multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); 738 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); 739 atomic_set(&multifd_send_state->count, 0); 740 for (i = 0; i < thread_count; i++) { 741 MultiFDSendParams *p = &multifd_send_state->params[i]; 742 743 qemu_mutex_init(&p->mutex); 744 qemu_sem_init(&p->sem, 0); 745 p->quit = false; 746 p->id = i; 747 p->name = g_strdup_printf("multifdsend_%d", i); 748 socket_send_channel_create(multifd_new_send_channel_async, p); 749 } 750 return 0; 751 } 752 753 struct { 754 MultiFDRecvParams *params; 755 /* number of created threads */ 756 int count; 757 } *multifd_recv_state; 758 759 static void multifd_recv_terminate_threads(Error *err) 760 { 761 int i; 762 763 if (err) { 764 MigrationState *s = migrate_get_current(); 765 migrate_set_error(s, err); 766 if (s->state == MIGRATION_STATUS_SETUP || 767 s->state == MIGRATION_STATUS_ACTIVE) { 768 migrate_set_state(&s->state, s->state, 769 MIGRATION_STATUS_FAILED); 770 } 771 } 772 773 for (i = 0; i < migrate_multifd_channels(); i++) { 774 MultiFDRecvParams *p = &multifd_recv_state->params[i]; 775 776 qemu_mutex_lock(&p->mutex); 777 p->quit = true; 778 qemu_sem_post(&p->sem); 779 qemu_mutex_unlock(&p->mutex); 780 } 781 } 782 783 int multifd_load_cleanup(Error **errp) 784 { 785 int i; 786 int ret = 0; 787 788 if (!migrate_use_multifd()) { 789 return 0; 790 } 791 multifd_recv_terminate_threads(NULL); 792 for (i = 0; i < migrate_multifd_channels(); i++) { 793 MultiFDRecvParams *p = &multifd_recv_state->params[i]; 794 795 if (p->running) { 796 qemu_thread_join(&p->thread); 797 } 798 object_unref(OBJECT(p->c)); 799 p->c = NULL; 800 qemu_mutex_destroy(&p->mutex); 801 qemu_sem_destroy(&p->sem); 802 g_free(p->name); 803 p->name = NULL; 804 } 805 g_free(multifd_recv_state->params); 806 multifd_recv_state->params = NULL; 807 g_free(multifd_recv_state); 808 multifd_recv_state = NULL; 809 810 return ret; 811 } 812 813 static void *multifd_recv_thread(void *opaque) 814 { 815 MultiFDRecvParams *p = opaque; 816 817 while (true) { 818 qemu_mutex_lock(&p->mutex); 819 if (p->quit) { 820 qemu_mutex_unlock(&p->mutex); 821 break; 822 } 823 qemu_mutex_unlock(&p->mutex); 824 qemu_sem_wait(&p->sem); 825 } 826 827 qemu_mutex_lock(&p->mutex); 828 p->running = false; 829 qemu_mutex_unlock(&p->mutex); 830 831 return NULL; 832 } 833 834 int multifd_load_setup(void) 835 { 836 int thread_count; 837 uint8_t i; 838 839 if (!migrate_use_multifd()) { 840 return 0; 841 } 842 thread_count = migrate_multifd_channels(); 843 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); 844 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); 845 atomic_set(&multifd_recv_state->count, 0); 846 for (i = 0; i < thread_count; i++) { 847 MultiFDRecvParams *p = &multifd_recv_state->params[i]; 848 849 qemu_mutex_init(&p->mutex); 850 qemu_sem_init(&p->sem, 0); 851 p->quit = false; 852 p->id = i; 853 p->name = g_strdup_printf("multifdrecv_%d", i); 854 } 855 return 0; 856 } 857 858 bool multifd_recv_all_channels_created(void) 859 { 860 int thread_count = migrate_multifd_channels(); 861 862 if 
(!migrate_use_multifd()) {
        return true;
    }

    return thread_count == atomic_read(&multifd_recv_state->count);
}

void multifd_recv_new_channel(QIOChannel *ioc)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    int id;

    id = multifd_recv_initial_packet(ioc, &local_err);
    if (id < 0) {
        multifd_recv_terminate_threads(local_err);
        return;
    }

    p = &multifd_recv_state->params[id];
    if (p->c != NULL) {
        error_setg(&local_err, "multifd: received id '%d' already setup",
                   id);
        multifd_recv_terminate_threads(local_err);
        return;
    }
    p->c = ioc;
    object_ref(OBJECT(ioc));

    p->running = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    atomic_inc(&multifd_recv_state->count);
    if (multifd_recv_state->count == migrate_multifd_channels()) {
        migration_incoming_process();
    }
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
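 *
 * A minimal illustration of the intended call pattern (this mirrors
 * the caller in ram_save_target_page() further down, slightly
 * simplified):
 *
 *   if (save_zero_page(rs, block, offset) > 0) {
 *       XBZRLE_cache_lock();
 *       xbzrle_cache_zero_page(rs, block->offset + offset);
 *       XBZRLE_cache_unlock();
 *   }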
968 */ 969 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) 970 { 971 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 972 return; 973 } 974 975 /* We don't care if this fails to allocate a new cache page 976 * as long as it updated an old one */ 977 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 978 ram_counters.dirty_sync_count); 979 } 980 981 #define ENCODING_FLAG_XBZRLE 0x1 982 983 /** 984 * save_xbzrle_page: compress and send current page 985 * 986 * Returns: 1 means that we wrote the page 987 * 0 means that page is identical to the one already sent 988 * -1 means that xbzrle would be longer than normal 989 * 990 * @rs: current RAM state 991 * @current_data: pointer to the address of the page contents 992 * @current_addr: addr of the page 993 * @block: block that contains the page we want to send 994 * @offset: offset inside the block for the page 995 * @last_stage: if we are at the completion stage 996 */ 997 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, 998 ram_addr_t current_addr, RAMBlock *block, 999 ram_addr_t offset, bool last_stage) 1000 { 1001 int encoded_len = 0, bytes_xbzrle; 1002 uint8_t *prev_cached_page; 1003 1004 if (!cache_is_cached(XBZRLE.cache, current_addr, 1005 ram_counters.dirty_sync_count)) { 1006 xbzrle_counters.cache_miss++; 1007 if (!last_stage) { 1008 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 1009 ram_counters.dirty_sync_count) == -1) { 1010 return -1; 1011 } else { 1012 /* update *current_data when the page has been 1013 inserted into cache */ 1014 *current_data = get_cached_data(XBZRLE.cache, current_addr); 1015 } 1016 } 1017 return -1; 1018 } 1019 1020 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 1021 1022 /* save current buffer into memory */ 1023 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 1024 1025 /* XBZRLE encoding (if there is no overflow) */ 1026 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 1027 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 1028 TARGET_PAGE_SIZE); 1029 if (encoded_len == 0) { 1030 trace_save_xbzrle_page_skipping(); 1031 return 0; 1032 } else if (encoded_len == -1) { 1033 trace_save_xbzrle_page_overflow(); 1034 xbzrle_counters.overflow++; 1035 /* update data in the cache */ 1036 if (!last_stage) { 1037 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); 1038 *current_data = prev_cached_page; 1039 } 1040 return -1; 1041 } 1042 1043 /* we need to update the data in the cache, in order to get the same data */ 1044 if (!last_stage) { 1045 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 1046 } 1047 1048 /* Send XBZRLE based compressed page */ 1049 bytes_xbzrle = save_page_header(rs, rs->f, block, 1050 offset | RAM_SAVE_FLAG_XBZRLE); 1051 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); 1052 qemu_put_be16(rs->f, encoded_len); 1053 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); 1054 bytes_xbzrle += encoded_len + 1 + 2; 1055 xbzrle_counters.pages++; 1056 xbzrle_counters.bytes += bytes_xbzrle; 1057 ram_counters.transferred += bytes_xbzrle; 1058 1059 return 1; 1060 } 1061 1062 /** 1063 * migration_bitmap_find_dirty: find the next dirty page from start 1064 * 1065 * Called with rcu_read_lock() to protect migration_bitmap 1066 * 1067 * Returns the byte offset within memory region of the start of a dirty page 1068 * 1069 * @rs: current RAM state 1070 * @rb: RAMBlock where to search for dirty pages 1071 * @start: page where we start the search 1072 */ 1073 static inline 1074 unsigned 
long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 1075 unsigned long start) 1076 { 1077 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 1078 unsigned long *bitmap = rb->bmap; 1079 unsigned long next; 1080 1081 if (rs->ram_bulk_stage && start > 0) { 1082 next = start + 1; 1083 } else { 1084 next = find_next_bit(bitmap, size, start); 1085 } 1086 1087 return next; 1088 } 1089 1090 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 1091 RAMBlock *rb, 1092 unsigned long page) 1093 { 1094 bool ret; 1095 1096 ret = test_and_clear_bit(page, rb->bmap); 1097 1098 if (ret) { 1099 rs->migration_dirty_pages--; 1100 } 1101 return ret; 1102 } 1103 1104 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb, 1105 ram_addr_t start, ram_addr_t length) 1106 { 1107 rs->migration_dirty_pages += 1108 cpu_physical_memory_sync_dirty_bitmap(rb, start, length, 1109 &rs->num_dirty_pages_period); 1110 } 1111 1112 /** 1113 * ram_pagesize_summary: calculate all the pagesizes of a VM 1114 * 1115 * Returns a summary bitmap of the page sizes of all RAMBlocks 1116 * 1117 * For VMs with just normal pages this is equivalent to the host page 1118 * size. If it's got some huge pages then it's the OR of all the 1119 * different page sizes. 1120 */ 1121 uint64_t ram_pagesize_summary(void) 1122 { 1123 RAMBlock *block; 1124 uint64_t summary = 0; 1125 1126 RAMBLOCK_FOREACH(block) { 1127 summary |= block->page_size; 1128 } 1129 1130 return summary; 1131 } 1132 1133 static void migration_bitmap_sync(RAMState *rs) 1134 { 1135 RAMBlock *block; 1136 int64_t end_time; 1137 uint64_t bytes_xfer_now; 1138 1139 ram_counters.dirty_sync_count++; 1140 1141 if (!rs->time_last_bitmap_sync) { 1142 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1143 } 1144 1145 trace_migration_bitmap_sync_start(); 1146 memory_global_dirty_log_sync(); 1147 1148 qemu_mutex_lock(&rs->bitmap_mutex); 1149 rcu_read_lock(); 1150 RAMBLOCK_FOREACH(block) { 1151 migration_bitmap_sync_range(rs, block, 0, block->used_length); 1152 } 1153 rcu_read_unlock(); 1154 qemu_mutex_unlock(&rs->bitmap_mutex); 1155 1156 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1157 1158 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1159 1160 /* more than 1 second = 1000 millisecons */ 1161 if (end_time > rs->time_last_bitmap_sync + 1000) { 1162 /* calculate period counters */ 1163 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000 1164 / (end_time - rs->time_last_bitmap_sync); 1165 bytes_xfer_now = ram_counters.transferred; 1166 1167 /* During block migration the auto-converge logic incorrectly detects 1168 * that ram migration makes no progress. Avoid this by disabling the 1169 * throttling logic during the bulk phase of block migration. */ 1170 if (migrate_auto_converge() && !blk_mig_bulk_active()) { 1171 /* The following detection logic can be refined later. For now: 1172 Check to see if the dirtied bytes is 50% more than the approx. 1173 amount of bytes that just got transferred since the last time we 1174 were in this routine. 
If that happens twice, start or increase 1175 throttling */ 1176 1177 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE > 1178 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) && 1179 (++rs->dirty_rate_high_cnt >= 2)) { 1180 trace_migration_throttle(); 1181 rs->dirty_rate_high_cnt = 0; 1182 mig_throttle_guest_down(); 1183 } 1184 } 1185 1186 if (migrate_use_xbzrle()) { 1187 if (rs->iterations_prev != rs->iterations) { 1188 xbzrle_counters.cache_miss_rate = 1189 (double)(xbzrle_counters.cache_miss - 1190 rs->xbzrle_cache_miss_prev) / 1191 (rs->iterations - rs->iterations_prev); 1192 } 1193 rs->iterations_prev = rs->iterations; 1194 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 1195 } 1196 1197 /* reset period counters */ 1198 rs->time_last_bitmap_sync = end_time; 1199 rs->num_dirty_pages_period = 0; 1200 rs->bytes_xfer_prev = bytes_xfer_now; 1201 } 1202 if (migrate_use_events()) { 1203 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL); 1204 } 1205 } 1206 1207 /** 1208 * save_zero_page: send the zero page to the stream 1209 * 1210 * Returns the number of pages written. 1211 * 1212 * @rs: current RAM state 1213 * @block: block that contains the page we want to send 1214 * @offset: offset inside the block for the page 1215 */ 1216 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) 1217 { 1218 uint8_t *p = block->host + offset; 1219 int pages = -1; 1220 1221 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 1222 ram_counters.duplicate++; 1223 ram_counters.transferred += 1224 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO); 1225 qemu_put_byte(rs->f, 0); 1226 ram_counters.transferred += 1; 1227 pages = 1; 1228 } 1229 1230 return pages; 1231 } 1232 1233 static void ram_release_pages(const char *rbname, uint64_t offset, int pages) 1234 { 1235 if (!migrate_release_ram() || !migration_in_postcopy()) { 1236 return; 1237 } 1238 1239 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS); 1240 } 1241 1242 /* 1243 * @pages: the number of pages written by the control path, 1244 * < 0 - error 1245 * > 0 - number of pages written 1246 * 1247 * Return true if the pages has been saved, otherwise false is returned. 1248 */ 1249 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1250 int *pages) 1251 { 1252 uint64_t bytes_xmit = 0; 1253 int ret; 1254 1255 *pages = -1; 1256 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, 1257 &bytes_xmit); 1258 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { 1259 return false; 1260 } 1261 1262 if (bytes_xmit) { 1263 ram_counters.transferred += bytes_xmit; 1264 *pages = 1; 1265 } 1266 1267 if (ret == RAM_SAVE_CONTROL_DELAYED) { 1268 return true; 1269 } 1270 1271 if (bytes_xmit > 0) { 1272 ram_counters.normal++; 1273 } else if (bytes_xmit == 0) { 1274 ram_counters.duplicate++; 1275 } 1276 1277 return true; 1278 } 1279 1280 /* 1281 * directly send the page to the stream 1282 * 1283 * Returns the number of pages written. 
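 *
 * Rough wire layout produced for one page (see save_page_header()):
 *
 *   8 bytes                  be64: offset | RAM_SAVE_FLAG_PAGE
 *   1 + strlen(idstr) bytes  block idstr, only when the block changes
 *   TARGET_PAGE_SIZE bytes   the raw page contents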
1284 * 1285 * @rs: current RAM state 1286 * @block: block that contains the page we want to send 1287 * @offset: offset inside the block for the page 1288 * @buf: the page to be sent 1289 * @async: send to page asyncly 1290 */ 1291 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, 1292 uint8_t *buf, bool async) 1293 { 1294 ram_counters.transferred += save_page_header(rs, rs->f, block, 1295 offset | RAM_SAVE_FLAG_PAGE); 1296 if (async) { 1297 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, 1298 migrate_release_ram() & 1299 migration_in_postcopy()); 1300 } else { 1301 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); 1302 } 1303 ram_counters.transferred += TARGET_PAGE_SIZE; 1304 ram_counters.normal++; 1305 return 1; 1306 } 1307 1308 /** 1309 * ram_save_page: send the given page to the stream 1310 * 1311 * Returns the number of pages written. 1312 * < 0 - error 1313 * >=0 - Number of pages written - this might legally be 0 1314 * if xbzrle noticed the page was the same. 1315 * 1316 * @rs: current RAM state 1317 * @block: block that contains the page we want to send 1318 * @offset: offset inside the block for the page 1319 * @last_stage: if we are at the completion stage 1320 */ 1321 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) 1322 { 1323 int pages = -1; 1324 uint8_t *p; 1325 bool send_async = true; 1326 RAMBlock *block = pss->block; 1327 ram_addr_t offset = pss->page << TARGET_PAGE_BITS; 1328 ram_addr_t current_addr = block->offset + offset; 1329 1330 p = block->host + offset; 1331 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1332 1333 XBZRLE_cache_lock(); 1334 if (!rs->ram_bulk_stage && !migration_in_postcopy() && 1335 migrate_use_xbzrle()) { 1336 pages = save_xbzrle_page(rs, &p, current_addr, block, 1337 offset, last_stage); 1338 if (!last_stage) { 1339 /* Can't send this cached data async, since the cache page 1340 * might get updated before it gets to the wire 1341 */ 1342 send_async = false; 1343 } 1344 } 1345 1346 /* XBZRLE overflow or normal page */ 1347 if (pages == -1) { 1348 pages = save_normal_page(rs, block, offset, p, send_async); 1349 } 1350 1351 XBZRLE_cache_unlock(); 1352 1353 return pages; 1354 } 1355 1356 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, 1357 ram_addr_t offset, uint8_t *source_buf) 1358 { 1359 RAMState *rs = ram_state; 1360 int bytes_sent, blen; 1361 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); 1362 1363 bytes_sent = save_page_header(rs, f, block, offset | 1364 RAM_SAVE_FLAG_COMPRESS_PAGE); 1365 1366 /* 1367 * copy it to a internal buffer to avoid it being modified by VM 1368 * so that we can catch up the error during compression and 1369 * decompression 1370 */ 1371 memcpy(source_buf, p, TARGET_PAGE_SIZE); 1372 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); 1373 if (blen < 0) { 1374 bytes_sent = 0; 1375 qemu_file_set_error(migrate_get_current()->to_dst_file, blen); 1376 error_report("compressed data failed!"); 1377 } else { 1378 bytes_sent += blen; 1379 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1); 1380 } 1381 1382 return bytes_sent; 1383 } 1384 1385 static void flush_compressed_data(RAMState *rs) 1386 { 1387 int idx, len, thread_count; 1388 1389 if (!migrate_use_compression()) { 1390 return; 1391 } 1392 thread_count = migrate_compress_threads(); 1393 1394 qemu_mutex_lock(&comp_done_lock); 1395 for (idx = 0; idx < thread_count; idx++) { 1396 while (!comp_param[idx].done) { 1397 
qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1398 } 1399 } 1400 qemu_mutex_unlock(&comp_done_lock); 1401 1402 for (idx = 0; idx < thread_count; idx++) { 1403 qemu_mutex_lock(&comp_param[idx].mutex); 1404 if (!comp_param[idx].quit) { 1405 len = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1406 ram_counters.transferred += len; 1407 } 1408 qemu_mutex_unlock(&comp_param[idx].mutex); 1409 } 1410 } 1411 1412 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 1413 ram_addr_t offset) 1414 { 1415 param->block = block; 1416 param->offset = offset; 1417 } 1418 1419 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, 1420 ram_addr_t offset) 1421 { 1422 int idx, thread_count, bytes_xmit = -1, pages = -1; 1423 1424 thread_count = migrate_compress_threads(); 1425 qemu_mutex_lock(&comp_done_lock); 1426 while (true) { 1427 for (idx = 0; idx < thread_count; idx++) { 1428 if (comp_param[idx].done) { 1429 comp_param[idx].done = false; 1430 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); 1431 qemu_mutex_lock(&comp_param[idx].mutex); 1432 set_compress_params(&comp_param[idx], block, offset); 1433 qemu_cond_signal(&comp_param[idx].cond); 1434 qemu_mutex_unlock(&comp_param[idx].mutex); 1435 pages = 1; 1436 ram_counters.normal++; 1437 ram_counters.transferred += bytes_xmit; 1438 break; 1439 } 1440 } 1441 if (pages > 0) { 1442 break; 1443 } else { 1444 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 1445 } 1446 } 1447 qemu_mutex_unlock(&comp_done_lock); 1448 1449 return pages; 1450 } 1451 1452 /** 1453 * find_dirty_block: find the next dirty page and update any state 1454 * associated with the search process. 1455 * 1456 * Returns if a page is found 1457 * 1458 * @rs: current RAM state 1459 * @pss: data about the state of the current dirty page scan 1460 * @again: set to false if the search has scanned the whole of RAM 1461 */ 1462 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) 1463 { 1464 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); 1465 if (pss->complete_round && pss->block == rs->last_seen_block && 1466 pss->page >= rs->last_page) { 1467 /* 1468 * We've been once around the RAM and haven't found anything. 1469 * Give up. 1470 */ 1471 *again = false; 1472 return false; 1473 } 1474 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) { 1475 /* Didn't find anything in this RAM Block */ 1476 pss->page = 0; 1477 pss->block = QLIST_NEXT_RCU(pss->block, next); 1478 if (!pss->block) { 1479 /* Hit the end of the list */ 1480 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1481 /* Flag that we've looped */ 1482 pss->complete_round = true; 1483 rs->ram_bulk_stage = false; 1484 if (migrate_use_xbzrle()) { 1485 /* If xbzrle is on, stop using the data compression at this 1486 * point. In theory, xbzrle can do better than compression. 1487 */ 1488 flush_compressed_data(rs); 1489 } 1490 } 1491 /* Didn't find anything this time, but try again on the new block */ 1492 *again = true; 1493 return false; 1494 } else { 1495 /* Can go around again, but... 
*/ 1496 *again = true; 1497 /* We've found something so probably don't need to */ 1498 return true; 1499 } 1500 } 1501 1502 /** 1503 * unqueue_page: gets a page of the queue 1504 * 1505 * Helper for 'get_queued_page' - gets a page off the queue 1506 * 1507 * Returns the block of the page (or NULL if none available) 1508 * 1509 * @rs: current RAM state 1510 * @offset: used to return the offset within the RAMBlock 1511 */ 1512 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1513 { 1514 RAMBlock *block = NULL; 1515 1516 qemu_mutex_lock(&rs->src_page_req_mutex); 1517 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) { 1518 struct RAMSrcPageRequest *entry = 1519 QSIMPLEQ_FIRST(&rs->src_page_requests); 1520 block = entry->rb; 1521 *offset = entry->offset; 1522 1523 if (entry->len > TARGET_PAGE_SIZE) { 1524 entry->len -= TARGET_PAGE_SIZE; 1525 entry->offset += TARGET_PAGE_SIZE; 1526 } else { 1527 memory_region_unref(block->mr); 1528 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1529 g_free(entry); 1530 } 1531 } 1532 qemu_mutex_unlock(&rs->src_page_req_mutex); 1533 1534 return block; 1535 } 1536 1537 /** 1538 * get_queued_page: unqueue a page from the postocpy requests 1539 * 1540 * Skips pages that are already sent (!dirty) 1541 * 1542 * Returns if a queued page is found 1543 * 1544 * @rs: current RAM state 1545 * @pss: data about the state of the current dirty page scan 1546 */ 1547 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1548 { 1549 RAMBlock *block; 1550 ram_addr_t offset; 1551 bool dirty; 1552 1553 do { 1554 block = unqueue_page(rs, &offset); 1555 /* 1556 * We're sending this page, and since it's postcopy nothing else 1557 * will dirty it, and we must make sure it doesn't get sent again 1558 * even if this queue request was received after the background 1559 * search already sent it. 1560 */ 1561 if (block) { 1562 unsigned long page; 1563 1564 page = offset >> TARGET_PAGE_BITS; 1565 dirty = test_bit(page, block->bmap); 1566 if (!dirty) { 1567 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1568 page, test_bit(page, block->unsentmap)); 1569 } else { 1570 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1571 } 1572 } 1573 1574 } while (block && !dirty); 1575 1576 if (block) { 1577 /* 1578 * As soon as we start servicing pages out of order, then we have 1579 * to kill the bulk stage, since the bulk stage assumes 1580 * in (migration_bitmap_find_and_reset_dirty) that every page is 1581 * dirty, that's no longer true. 1582 */ 1583 rs->ram_bulk_stage = false; 1584 1585 /* 1586 * We want the background search to continue from the queued page 1587 * since the guest is likely to want other pages near to the page 1588 * it just requested. 1589 */ 1590 pss->block = block; 1591 pss->page = offset >> TARGET_PAGE_BITS; 1592 } 1593 1594 return !!block; 1595 } 1596 1597 /** 1598 * migration_page_queue_free: drop any remaining pages in the ram 1599 * request queue 1600 * 1601 * It should be empty at the end anyway, but in error cases there may 1602 * be some left. in case that there is any page left, we drop it. 1603 * 1604 */ 1605 static void migration_page_queue_free(RAMState *rs) 1606 { 1607 struct RAMSrcPageRequest *mspr, *next_mspr; 1608 /* This queue generally should be empty - but in the case of a failed 1609 * migration might have some droppings in. 
1610 */ 1611 rcu_read_lock(); 1612 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1613 memory_region_unref(mspr->rb->mr); 1614 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1615 g_free(mspr); 1616 } 1617 rcu_read_unlock(); 1618 } 1619 1620 /** 1621 * ram_save_queue_pages: queue the page for transmission 1622 * 1623 * A request from postcopy destination for example. 1624 * 1625 * Returns zero on success or negative on error 1626 * 1627 * @rbname: Name of the RAMBLock of the request. NULL means the 1628 * same that last one. 1629 * @start: starting address from the start of the RAMBlock 1630 * @len: length (in bytes) to send 1631 */ 1632 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) 1633 { 1634 RAMBlock *ramblock; 1635 RAMState *rs = ram_state; 1636 1637 ram_counters.postcopy_requests++; 1638 rcu_read_lock(); 1639 if (!rbname) { 1640 /* Reuse last RAMBlock */ 1641 ramblock = rs->last_req_rb; 1642 1643 if (!ramblock) { 1644 /* 1645 * Shouldn't happen, we can't reuse the last RAMBlock if 1646 * it's the 1st request. 1647 */ 1648 error_report("ram_save_queue_pages no previous block"); 1649 goto err; 1650 } 1651 } else { 1652 ramblock = qemu_ram_block_by_name(rbname); 1653 1654 if (!ramblock) { 1655 /* We shouldn't be asked for a non-existent RAMBlock */ 1656 error_report("ram_save_queue_pages no block '%s'", rbname); 1657 goto err; 1658 } 1659 rs->last_req_rb = ramblock; 1660 } 1661 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1662 if (start+len > ramblock->used_length) { 1663 error_report("%s request overrun start=" RAM_ADDR_FMT " len=" 1664 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1665 __func__, start, len, ramblock->used_length); 1666 goto err; 1667 } 1668 1669 struct RAMSrcPageRequest *new_entry = 1670 g_malloc0(sizeof(struct RAMSrcPageRequest)); 1671 new_entry->rb = ramblock; 1672 new_entry->offset = start; 1673 new_entry->len = len; 1674 1675 memory_region_ref(ramblock->mr); 1676 qemu_mutex_lock(&rs->src_page_req_mutex); 1677 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1678 qemu_mutex_unlock(&rs->src_page_req_mutex); 1679 rcu_read_unlock(); 1680 1681 return 0; 1682 1683 err: 1684 rcu_read_unlock(); 1685 return -1; 1686 } 1687 1688 static bool save_page_use_compression(RAMState *rs) 1689 { 1690 if (!migrate_use_compression()) { 1691 return false; 1692 } 1693 1694 /* 1695 * If xbzrle is on, stop using the data compression after first 1696 * round of migration even if compression is enabled. In theory, 1697 * xbzrle can do better than compression. 
1698 */ 1699 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) { 1700 return true; 1701 } 1702 1703 return false; 1704 } 1705 1706 /** 1707 * ram_save_target_page: save one target page 1708 * 1709 * Returns the number of pages written 1710 * 1711 * @rs: current RAM state 1712 * @pss: data about the page we want to send 1713 * @last_stage: if we are at the completion stage 1714 */ 1715 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, 1716 bool last_stage) 1717 { 1718 RAMBlock *block = pss->block; 1719 ram_addr_t offset = pss->page << TARGET_PAGE_BITS; 1720 int res; 1721 1722 if (control_save_page(rs, block, offset, &res)) { 1723 return res; 1724 } 1725 1726 /* 1727 * When starting the process of a new block, the first page of 1728 * the block should be sent out before other pages in the same 1729 * block, and all the pages in last block should have been sent 1730 * out, keeping this order is important, because the 'cont' flag 1731 * is used to avoid resending the block name. 1732 */ 1733 if (block != rs->last_sent_block && save_page_use_compression(rs)) { 1734 flush_compressed_data(rs); 1735 } 1736 1737 res = save_zero_page(rs, block, offset); 1738 if (res > 0) { 1739 /* Must let xbzrle know, otherwise a previous (now 0'd) cached 1740 * page would be stale 1741 */ 1742 if (!save_page_use_compression(rs)) { 1743 XBZRLE_cache_lock(); 1744 xbzrle_cache_zero_page(rs, block->offset + offset); 1745 XBZRLE_cache_unlock(); 1746 } 1747 ram_release_pages(block->idstr, offset, res); 1748 return res; 1749 } 1750 1751 /* 1752 * Make sure the first page is sent out before other pages. 1753 * 1754 * we post it as normal page as compression will take much 1755 * CPU resource. 1756 */ 1757 if (block == rs->last_sent_block && save_page_use_compression(rs)) { 1758 return compress_page_with_multi_thread(rs, block, offset); 1759 } 1760 1761 return ram_save_page(rs, pss, last_stage); 1762 } 1763 1764 /** 1765 * ram_save_host_page: save a whole host page 1766 * 1767 * Starting at *offset send pages up to the end of the current host 1768 * page. It's valid for the initial offset to point into the middle of 1769 * a host page in which case the remainder of the hostpage is sent. 1770 * Only dirty target pages are sent. Note that the host page size may 1771 * be a huge page for this block. 1772 * The saving stops at the boundary of the used_length of the block 1773 * if the RAMBlock isn't a multiple of the host page size. 
1774 * 1775 * Returns the number of pages written or negative on error 1776 * 1777 * @rs: current RAM state 1778 * @ms: current migration state 1779 * @pss: data about the page we want to send 1780 * @last_stage: if we are at the completion stage 1781 */ 1782 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, 1783 bool last_stage) 1784 { 1785 int tmppages, pages = 0; 1786 size_t pagesize_bits = 1787 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 1788 1789 do { 1790 /* Check the pages is dirty and if it is send it */ 1791 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { 1792 pss->page++; 1793 continue; 1794 } 1795 1796 tmppages = ram_save_target_page(rs, pss, last_stage); 1797 if (tmppages < 0) { 1798 return tmppages; 1799 } 1800 1801 pages += tmppages; 1802 if (pss->block->unsentmap) { 1803 clear_bit(pss->page, pss->block->unsentmap); 1804 } 1805 1806 pss->page++; 1807 } while ((pss->page & (pagesize_bits - 1)) && 1808 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); 1809 1810 /* The offset we leave with is the last one we looked at */ 1811 pss->page--; 1812 return pages; 1813 } 1814 1815 /** 1816 * ram_find_and_save_block: finds a dirty page and sends it to f 1817 * 1818 * Called within an RCU critical section. 1819 * 1820 * Returns the number of pages written where zero means no dirty pages 1821 * 1822 * @rs: current RAM state 1823 * @last_stage: if we are at the completion stage 1824 * 1825 * On systems where host-page-size > target-page-size it will send all the 1826 * pages in a host page that are dirty. 1827 */ 1828 1829 static int ram_find_and_save_block(RAMState *rs, bool last_stage) 1830 { 1831 PageSearchStatus pss; 1832 int pages = 0; 1833 bool again, found; 1834 1835 /* No dirty page as there is zero RAM */ 1836 if (!ram_bytes_total()) { 1837 return pages; 1838 } 1839 1840 pss.block = rs->last_seen_block; 1841 pss.page = rs->last_page; 1842 pss.complete_round = false; 1843 1844 if (!pss.block) { 1845 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1846 } 1847 1848 do { 1849 again = true; 1850 found = get_queued_page(rs, &pss); 1851 1852 if (!found) { 1853 /* priority queue empty, so just search for something dirty */ 1854 found = find_dirty_block(rs, &pss, &again); 1855 } 1856 1857 if (found) { 1858 pages = ram_save_host_page(rs, &pss, last_stage); 1859 } 1860 } while (!pages && again); 1861 1862 rs->last_seen_block = pss.block; 1863 rs->last_page = pss.page; 1864 1865 return pages; 1866 } 1867 1868 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1869 { 1870 uint64_t pages = size / TARGET_PAGE_SIZE; 1871 1872 if (zero) { 1873 ram_counters.duplicate += pages; 1874 } else { 1875 ram_counters.normal += pages; 1876 ram_counters.transferred += size; 1877 qemu_update_position(f, size); 1878 } 1879 } 1880 1881 uint64_t ram_bytes_total(void) 1882 { 1883 RAMBlock *block; 1884 uint64_t total = 0; 1885 1886 rcu_read_lock(); 1887 RAMBLOCK_FOREACH(block) { 1888 total += block->used_length; 1889 } 1890 rcu_read_unlock(); 1891 return total; 1892 } 1893 1894 static void xbzrle_load_setup(void) 1895 { 1896 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 1897 } 1898 1899 static void xbzrle_load_cleanup(void) 1900 { 1901 g_free(XBZRLE.decoded_buf); 1902 XBZRLE.decoded_buf = NULL; 1903 } 1904 1905 static void ram_state_cleanup(RAMState **rsp) 1906 { 1907 if (*rsp) { 1908 migration_page_queue_free(*rsp); 1909 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 1910 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 1911 g_free(*rsp); 1912 *rsp = 
NULL; 1913 } 1914 } 1915 1916 static void xbzrle_cleanup(void) 1917 { 1918 XBZRLE_cache_lock(); 1919 if (XBZRLE.cache) { 1920 cache_fini(XBZRLE.cache); 1921 g_free(XBZRLE.encoded_buf); 1922 g_free(XBZRLE.current_buf); 1923 g_free(XBZRLE.zero_target_page); 1924 XBZRLE.cache = NULL; 1925 XBZRLE.encoded_buf = NULL; 1926 XBZRLE.current_buf = NULL; 1927 XBZRLE.zero_target_page = NULL; 1928 } 1929 XBZRLE_cache_unlock(); 1930 } 1931 1932 static void ram_save_cleanup(void *opaque) 1933 { 1934 RAMState **rsp = opaque; 1935 RAMBlock *block; 1936 1937 /* caller have hold iothread lock or is in a bh, so there is 1938 * no writing race against this migration_bitmap 1939 */ 1940 memory_global_dirty_log_stop(); 1941 1942 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1943 g_free(block->bmap); 1944 block->bmap = NULL; 1945 g_free(block->unsentmap); 1946 block->unsentmap = NULL; 1947 } 1948 1949 xbzrle_cleanup(); 1950 compress_threads_save_cleanup(); 1951 ram_state_cleanup(rsp); 1952 } 1953 1954 static void ram_state_reset(RAMState *rs) 1955 { 1956 rs->last_seen_block = NULL; 1957 rs->last_sent_block = NULL; 1958 rs->last_page = 0; 1959 rs->last_version = ram_list.version; 1960 rs->ram_bulk_stage = true; 1961 } 1962 1963 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1964 1965 /* 1966 * 'expected' is the value you expect the bitmap mostly to be full 1967 * of; it won't bother printing lines that are all this value. 1968 * If 'todump' is null the migration bitmap is dumped. 1969 */ 1970 void ram_debug_dump_bitmap(unsigned long *todump, bool expected, 1971 unsigned long pages) 1972 { 1973 int64_t cur; 1974 int64_t linelen = 128; 1975 char linebuf[129]; 1976 1977 for (cur = 0; cur < pages; cur += linelen) { 1978 int64_t curb; 1979 bool found = false; 1980 /* 1981 * Last line; catch the case where the line length 1982 * is longer than remaining ram 1983 */ 1984 if (cur + linelen > pages) { 1985 linelen = pages - cur; 1986 } 1987 for (curb = 0; curb < linelen; curb++) { 1988 bool thisbit = test_bit(cur + curb, todump); 1989 linebuf[curb] = thisbit ? '1' : '.'; 1990 found = found || (thisbit != expected); 1991 } 1992 if (found) { 1993 linebuf[curb] = '\0'; 1994 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); 1995 } 1996 } 1997 } 1998 1999 /* **** functions for postcopy ***** */ 2000 2001 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2002 { 2003 struct RAMBlock *block; 2004 2005 RAMBLOCK_FOREACH(block) { 2006 unsigned long *bitmap = block->bmap; 2007 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2008 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2009 2010 while (run_start < range) { 2011 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2012 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS, 2013 (run_end - run_start) << TARGET_PAGE_BITS); 2014 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2015 } 2016 } 2017 } 2018 2019 /** 2020 * postcopy_send_discard_bm_ram: discard a RAMBlock 2021 * 2022 * Returns zero on success 2023 * 2024 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2025 * Note: At this point the 'unsentmap' is the processed bitmap combined 2026 * with the dirtymap; so a '1' means it's either dirty or unsent. 
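 *
 * Example (illustrative): if the unsentmap of a block is 0b0011100
 * (bits 2..4 set), the loop below finds one = 2 and zero = 5, and
 * emits a single postcopy_discard_send_range(ms, pds, 2, 3).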
2027 * 2028 * @ms: current migration state 2029 * @pds: state for postcopy 2030 * @block: RAMBlock to discard 2031 * 2032 */ 2033 static int postcopy_send_discard_bm_ram(MigrationState *ms, 2034 PostcopyDiscardState *pds, 2035 RAMBlock *block) 2036 { 2037 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2038 unsigned long current; 2039 unsigned long *unsentmap = block->unsentmap; 2040 2041 for (current = 0; current < end; ) { 2042 unsigned long one = find_next_bit(unsentmap, end, current); 2043 2044 if (one <= end) { 2045 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); 2046 unsigned long discard_length; 2047 2048 if (zero >= end) { 2049 discard_length = end - one; 2050 } else { 2051 discard_length = zero - one; 2052 } 2053 if (discard_length) { 2054 postcopy_discard_send_range(ms, pds, one, discard_length); 2055 } 2056 current = one + discard_length; 2057 } else { 2058 current = one; 2059 } 2060 } 2061 2062 return 0; 2063 } 2064 2065 /** 2066 * postcopy_each_ram_send_discard: discard all RAMBlocks 2067 * 2068 * Returns 0 for success or negative for error 2069 * 2070 * Utility for the outgoing postcopy code. 2071 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2072 * passing it bitmap indexes and name. 2073 * (qemu_ram_foreach_block ends up passing unscaled lengths 2074 * which would mean postcopy code would have to deal with target page) 2075 * 2076 * @ms: current migration state 2077 */ 2078 static int postcopy_each_ram_send_discard(MigrationState *ms) 2079 { 2080 struct RAMBlock *block; 2081 int ret; 2082 2083 RAMBLOCK_FOREACH(block) { 2084 PostcopyDiscardState *pds = 2085 postcopy_discard_send_init(ms, block->idstr); 2086 2087 /* 2088 * Postcopy sends chunks of bitmap over the wire, but it 2089 * just needs indexes at this point, which avoids it having 2090 * target page specific code. 2091 */ 2092 ret = postcopy_send_discard_bm_ram(ms, pds, block); 2093 postcopy_discard_send_finish(ms, pds); 2094 if (ret) { 2095 return ret; 2096 } 2097 } 2098 2099 return 0; 2100 } 2101 2102 /** 2103 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages 2104 * 2105 * Helper for postcopy_chunk_hostpages; it's called twice to 2106 * canonicalize the two bitmaps, which are similar but one is 2107 * inverted. 2108 * 2109 * Postcopy requires that all target pages in a hostpage are dirty or 2110 * clean, not a mix. This function canonicalizes the bitmaps.
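 * For example, with 2MB host pages and 4KB target pages (host_ratio of
 * 512), a run of dirty or unsent target pages that starts or ends part way
 * through a host page causes the whole host page to be discarded and
 * re-marked as dirty and unsent.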
2111 * 2112 * @ms: current migration state 2113 * @unsent_pass: if true we need to canonicalize partially unsent host pages 2114 * otherwise we need to canonicalize partially dirty host pages 2115 * @block: block that contains the page we want to canonicalize 2116 * @pds: state for postcopy 2117 */ 2118 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, 2119 RAMBlock *block, 2120 PostcopyDiscardState *pds) 2121 { 2122 RAMState *rs = ram_state; 2123 unsigned long *bitmap = block->bmap; 2124 unsigned long *unsentmap = block->unsentmap; 2125 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2126 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2127 unsigned long run_start; 2128 2129 if (block->page_size == TARGET_PAGE_SIZE) { 2130 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2131 return; 2132 } 2133 2134 if (unsent_pass) { 2135 /* Find a sent page */ 2136 run_start = find_next_zero_bit(unsentmap, pages, 0); 2137 } else { 2138 /* Find a dirty page */ 2139 run_start = find_next_bit(bitmap, pages, 0); 2140 } 2141 2142 while (run_start < pages) { 2143 bool do_fixup = false; 2144 unsigned long fixup_start_addr; 2145 unsigned long host_offset; 2146 2147 /* 2148 * If the start of this run of pages is in the middle of a host 2149 * page, then we need to fixup this host page. 2150 */ 2151 host_offset = run_start % host_ratio; 2152 if (host_offset) { 2153 do_fixup = true; 2154 run_start -= host_offset; 2155 fixup_start_addr = run_start; 2156 /* For the next pass */ 2157 run_start = run_start + host_ratio; 2158 } else { 2159 /* Find the end of this run */ 2160 unsigned long run_end; 2161 if (unsent_pass) { 2162 run_end = find_next_bit(unsentmap, pages, run_start + 1); 2163 } else { 2164 run_end = find_next_zero_bit(bitmap, pages, run_start + 1); 2165 } 2166 /* 2167 * If the end isn't at the start of a host page, then the 2168 * run doesn't finish at the end of a host page 2169 * and we need to discard. 2170 */ 2171 host_offset = run_end % host_ratio; 2172 if (host_offset) { 2173 do_fixup = true; 2174 fixup_start_addr = run_end - host_offset; 2175 /* 2176 * This host page has gone, the next loop iteration starts 2177 * from after the fixup 2178 */ 2179 run_start = fixup_start_addr + host_ratio; 2180 } else { 2181 /* 2182 * No discards on this iteration, next loop starts from 2183 * next sent/dirty page 2184 */ 2185 run_start = run_end + 1; 2186 } 2187 } 2188 2189 if (do_fixup) { 2190 unsigned long page; 2191 2192 /* Tell the destination to discard this page */ 2193 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { 2194 /* For the unsent_pass we: 2195 * discard partially sent pages 2196 * For the !unsent_pass (dirty) we: 2197 * discard partially dirty pages that were sent 2198 * (any partially sent pages were already discarded 2199 * by the previous unsent_pass) 2200 */ 2201 postcopy_discard_send_range(ms, pds, fixup_start_addr, 2202 host_ratio); 2203 } 2204 2205 /* Clean up the bitmap */ 2206 for (page = fixup_start_addr; 2207 page < fixup_start_addr + host_ratio; page++) { 2208 /* All pages in this host page are now not sent */ 2209 set_bit(page, unsentmap); 2210 2211 /* 2212 * Remark them as dirty, updating the count for any pages 2213 * that weren't previously dirty. 
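 * (test_and_set_bit() returns the previous value of the bit, so the
 * addition below only increments the dirty page count for pages that
 * were not already dirty.)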
2214 */ 2215 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2216 } 2217 } 2218 2219 if (unsent_pass) { 2220 /* Find the next sent page for the next iteration */ 2221 run_start = find_next_zero_bit(unsentmap, pages, run_start); 2222 } else { 2223 /* Find the next dirty page for the next iteration */ 2224 run_start = find_next_bit(bitmap, pages, run_start); 2225 } 2226 } 2227 } 2228 2229 /** 2230 * postcopy_chunk_hostpages: discard any partially sent host page 2231 * 2232 * Utility for the outgoing postcopy code. 2233 * 2234 * Discard any partially sent host-page size chunks, mark any partially 2235 * dirty host-page size chunks as all dirty. In this case the host-page 2236 * is the host-page for the particular RAMBlock, i.e. it might be a huge page. 2237 * 2238 * Returns zero on success 2239 * 2240 * @ms: current migration state 2241 * @block: block we want to work with 2242 */ 2243 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block) 2244 { 2245 PostcopyDiscardState *pds = 2246 postcopy_discard_send_init(ms, block->idstr); 2247 2248 /* First pass: Discard all partially sent host pages */ 2249 postcopy_chunk_hostpages_pass(ms, true, block, pds); 2250 /* 2251 * Second pass: Ensure that all partially dirty host pages are made 2252 * fully dirty. 2253 */ 2254 postcopy_chunk_hostpages_pass(ms, false, block, pds); 2255 2256 postcopy_discard_send_finish(ms, pds); 2257 return 0; 2258 } 2259 2260 /** 2261 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2262 * 2263 * Returns zero on success 2264 * 2265 * Transmit the set of pages to be discarded after precopy to the target; 2266 * these are pages that: 2267 * a) Have been previously transmitted but are now dirty again 2268 * b) Have never been transmitted; this ensures that 2269 * any pages on the destination that have been mapped by background 2270 * tasks get discarded (transparent huge pages are the specific concern) 2271 * Hopefully this is pretty sparse 2272 * 2273 * @ms: current migration state 2274 */ 2275 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 2276 { 2277 RAMState *rs = ram_state; 2278 RAMBlock *block; 2279 int ret; 2280 2281 rcu_read_lock(); 2282 2283 /* This should be our last sync, the source is now paused */ 2284 migration_bitmap_sync(rs); 2285 2286 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2287 rs->last_seen_block = NULL; 2288 rs->last_sent_block = NULL; 2289 rs->last_page = 0; 2290 2291 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 2292 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2293 unsigned long *bitmap = block->bmap; 2294 unsigned long *unsentmap = block->unsentmap; 2295 2296 if (!unsentmap) { 2297 /* We don't have a safe way to resize the sentmap, so 2298 * if the bitmap was resized it will be NULL at this 2299 * point.
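 * (That means the RAM block was resized while precopy was running;
 * postcopy cannot cope with that, so the migration is failed below.)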
2300 */ 2301 error_report("migration ram resized during precopy phase"); 2302 rcu_read_unlock(); 2303 return -EINVAL; 2304 } 2305 /* Deal with TPS != HPS and huge pages */ 2306 ret = postcopy_chunk_hostpages(ms, block); 2307 if (ret) { 2308 rcu_read_unlock(); 2309 return ret; 2310 } 2311 2312 /* 2313 * Update the unsentmap to be unsentmap = unsentmap | dirty 2314 */ 2315 bitmap_or(unsentmap, unsentmap, bitmap, pages); 2316 #ifdef DEBUG_POSTCOPY 2317 ram_debug_dump_bitmap(unsentmap, true, pages); 2318 #endif 2319 } 2320 trace_ram_postcopy_send_discard_bitmap(); 2321 2322 ret = postcopy_each_ram_send_discard(ms); 2323 rcu_read_unlock(); 2324 2325 return ret; 2326 } 2327 2328 /** 2329 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2330 * 2331 * Returns zero on success 2332 * 2333 * @rbname: name of the RAMBlock of the request. NULL means the 2334 * same that last one. 2335 * @start: RAMBlock starting page 2336 * @length: RAMBlock size 2337 */ 2338 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2339 { 2340 int ret = -1; 2341 2342 trace_ram_discard_range(rbname, start, length); 2343 2344 rcu_read_lock(); 2345 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2346 2347 if (!rb) { 2348 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2349 goto err; 2350 } 2351 2352 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2353 length >> qemu_target_page_bits()); 2354 ret = ram_block_discard_range(rb, start, length); 2355 2356 err: 2357 rcu_read_unlock(); 2358 2359 return ret; 2360 } 2361 2362 /* 2363 * For every allocation, we will try not to crash the VM if the 2364 * allocation failed. 2365 */ 2366 static int xbzrle_init(void) 2367 { 2368 Error *local_err = NULL; 2369 2370 if (!migrate_use_xbzrle()) { 2371 return 0; 2372 } 2373 2374 XBZRLE_cache_lock(); 2375 2376 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2377 if (!XBZRLE.zero_target_page) { 2378 error_report("%s: Error allocating zero page", __func__); 2379 goto err_out; 2380 } 2381 2382 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2383 TARGET_PAGE_SIZE, &local_err); 2384 if (!XBZRLE.cache) { 2385 error_report_err(local_err); 2386 goto free_zero_page; 2387 } 2388 2389 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2390 if (!XBZRLE.encoded_buf) { 2391 error_report("%s: Error allocating encoded_buf", __func__); 2392 goto free_cache; 2393 } 2394 2395 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2396 if (!XBZRLE.current_buf) { 2397 error_report("%s: Error allocating current_buf", __func__); 2398 goto free_encoded_buf; 2399 } 2400 2401 /* We are all good */ 2402 XBZRLE_cache_unlock(); 2403 return 0; 2404 2405 free_encoded_buf: 2406 g_free(XBZRLE.encoded_buf); 2407 XBZRLE.encoded_buf = NULL; 2408 free_cache: 2409 cache_fini(XBZRLE.cache); 2410 XBZRLE.cache = NULL; 2411 free_zero_page: 2412 g_free(XBZRLE.zero_target_page); 2413 XBZRLE.zero_target_page = NULL; 2414 err_out: 2415 XBZRLE_cache_unlock(); 2416 return -ENOMEM; 2417 } 2418 2419 static int ram_state_init(RAMState **rsp) 2420 { 2421 *rsp = g_try_new0(RAMState, 1); 2422 2423 if (!*rsp) { 2424 error_report("%s: Init ramstate fail", __func__); 2425 return -1; 2426 } 2427 2428 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2429 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2430 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2431 2432 /* 2433 * Count the total number of pages used by ram blocks not including any 2434 * gaps due to alignment or unplugs. 
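 * ram_bytes_total() sums used_length over all RAM blocks, so shifting it
 * right by TARGET_PAGE_BITS below yields the number of target pages.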
2435 */ 2436 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; 2437 2438 ram_state_reset(*rsp); 2439 2440 return 0; 2441 } 2442 2443 static void ram_list_init_bitmaps(void) 2444 { 2445 RAMBlock *block; 2446 unsigned long pages; 2447 2448 /* Skip setting bitmap if there is no RAM */ 2449 if (ram_bytes_total()) { 2450 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 2451 pages = block->max_length >> TARGET_PAGE_BITS; 2452 block->bmap = bitmap_new(pages); 2453 bitmap_set(block->bmap, 0, pages); 2454 if (migrate_postcopy_ram()) { 2455 block->unsentmap = bitmap_new(pages); 2456 bitmap_set(block->unsentmap, 0, pages); 2457 } 2458 } 2459 } 2460 } 2461 2462 static void ram_init_bitmaps(RAMState *rs) 2463 { 2464 /* For memory_global_dirty_log_start below. */ 2465 qemu_mutex_lock_iothread(); 2466 qemu_mutex_lock_ramlist(); 2467 rcu_read_lock(); 2468 2469 ram_list_init_bitmaps(); 2470 memory_global_dirty_log_start(); 2471 migration_bitmap_sync(rs); 2472 2473 rcu_read_unlock(); 2474 qemu_mutex_unlock_ramlist(); 2475 qemu_mutex_unlock_iothread(); 2476 } 2477 2478 static int ram_init_all(RAMState **rsp) 2479 { 2480 if (ram_state_init(rsp)) { 2481 return -1; 2482 } 2483 2484 if (xbzrle_init()) { 2485 ram_state_cleanup(rsp); 2486 return -1; 2487 } 2488 2489 ram_init_bitmaps(*rsp); 2490 2491 return 0; 2492 } 2493 2494 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2495 { 2496 RAMBlock *block; 2497 uint64_t pages = 0; 2498 2499 /* 2500 * Postcopy is not using xbzrle/compression, so no need for that. 2501 * Also, since source are already halted, we don't need to care 2502 * about dirty page logging as well. 2503 */ 2504 2505 RAMBLOCK_FOREACH(block) { 2506 pages += bitmap_count_one(block->bmap, 2507 block->used_length >> TARGET_PAGE_BITS); 2508 } 2509 2510 /* This may not be aligned with current bitmaps. Recalculate. */ 2511 rs->migration_dirty_pages = pages; 2512 2513 rs->last_seen_block = NULL; 2514 rs->last_sent_block = NULL; 2515 rs->last_page = 0; 2516 rs->last_version = ram_list.version; 2517 /* 2518 * Disable the bulk stage, otherwise we'll resend the whole RAM no 2519 * matter what we have sent. 2520 */ 2521 rs->ram_bulk_stage = false; 2522 2523 /* Update RAMState cache of output QEMUFile */ 2524 rs->f = out; 2525 2526 trace_ram_state_resume_prepare(pages); 2527 } 2528 2529 /* 2530 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2531 * long-running RCU critical section. When rcu-reclaims in the code 2532 * start to become numerous it will be necessary to reduce the 2533 * granularity of these critical sections. 2534 */ 2535 2536 /** 2537 * ram_save_setup: Setup RAM for migration 2538 * 2539 * Returns zero to indicate success and negative for error 2540 * 2541 * @f: QEMUFile where to send the data 2542 * @opaque: RAMState pointer 2543 */ 2544 static int ram_save_setup(QEMUFile *f, void *opaque) 2545 { 2546 RAMState **rsp = opaque; 2547 RAMBlock *block; 2548 2549 if (compress_threads_save_setup()) { 2550 return -1; 2551 } 2552 2553 /* migration has already setup the bitmap, reuse it. 
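 * (when already in COLO state the bitmaps were set up by the initial
 * migration and are reused, so re-initialisation is skipped)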
*/ 2554 if (!migration_in_colo_state()) { 2555 if (ram_init_all(rsp) != 0) { 2556 compress_threads_save_cleanup(); 2557 return -1; 2558 } 2559 } 2560 (*rsp)->f = f; 2561 2562 rcu_read_lock(); 2563 2564 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); 2565 2566 RAMBLOCK_FOREACH(block) { 2567 qemu_put_byte(f, strlen(block->idstr)); 2568 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 2569 qemu_put_be64(f, block->used_length); 2570 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) { 2571 qemu_put_be64(f, block->page_size); 2572 } 2573 } 2574 2575 rcu_read_unlock(); 2576 2577 ram_control_before_iterate(f, RAM_CONTROL_SETUP); 2578 ram_control_after_iterate(f, RAM_CONTROL_SETUP); 2579 2580 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2581 2582 return 0; 2583 } 2584 2585 /** 2586 * ram_save_iterate: iterative stage for migration 2587 * 2588 * Returns zero to indicate success and negative for error 2589 * 2590 * @f: QEMUFile where to send the data 2591 * @opaque: RAMState pointer 2592 */ 2593 static int ram_save_iterate(QEMUFile *f, void *opaque) 2594 { 2595 RAMState **temp = opaque; 2596 RAMState *rs = *temp; 2597 int ret; 2598 int i; 2599 int64_t t0; 2600 int done = 0; 2601 2602 if (blk_mig_bulk_active()) { 2603 /* Avoid transferring ram during bulk phase of block migration as 2604 * the bulk phase will usually take a long time and transferring 2605 * ram updates during that time is pointless. */ 2606 goto out; 2607 } 2608 2609 rcu_read_lock(); 2610 if (ram_list.version != rs->last_version) { 2611 ram_state_reset(rs); 2612 } 2613 2614 /* Read version before ram_list.blocks */ 2615 smp_rmb(); 2616 2617 ram_control_before_iterate(f, RAM_CONTROL_ROUND); 2618 2619 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2620 i = 0; 2621 while ((ret = qemu_file_rate_limit(f)) == 0) { 2622 int pages; 2623 2624 pages = ram_find_and_save_block(rs, false); 2625 /* no more pages to sent */ 2626 if (pages == 0) { 2627 done = 1; 2628 break; 2629 } 2630 rs->iterations++; 2631 2632 /* we want to check in the 1st loop, just in case it was the 1st time 2633 and we had to sync the dirty bitmap. 2634 qemu_get_clock_ns() is a bit expensive, so we only check each some 2635 iterations 2636 */ 2637 if ((i & 63) == 0) { 2638 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000; 2639 if (t1 > MAX_WAIT) { 2640 trace_ram_save_iterate_big_wait(t1, i); 2641 break; 2642 } 2643 } 2644 i++; 2645 } 2646 flush_compressed_data(rs); 2647 rcu_read_unlock(); 2648 2649 /* 2650 * Must occur before EOS (or any QEMUFile operation) 2651 * because of RDMA protocol. 
2652 */ 2653 ram_control_after_iterate(f, RAM_CONTROL_ROUND); 2654 2655 out: 2656 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2657 ram_counters.transferred += 8; 2658 2659 ret = qemu_file_get_error(f); 2660 if (ret < 0) { 2661 return ret; 2662 } 2663 2664 return done; 2665 } 2666 2667 /** 2668 * ram_save_complete: function called to send the remaining amount of ram 2669 * 2670 * Returns zero to indicate success 2671 * 2672 * Called with iothread lock 2673 * 2674 * @f: QEMUFile where to send the data 2675 * @opaque: RAMState pointer 2676 */ 2677 static int ram_save_complete(QEMUFile *f, void *opaque) 2678 { 2679 RAMState **temp = opaque; 2680 RAMState *rs = *temp; 2681 2682 rcu_read_lock(); 2683 2684 if (!migration_in_postcopy()) { 2685 migration_bitmap_sync(rs); 2686 } 2687 2688 ram_control_before_iterate(f, RAM_CONTROL_FINISH); 2689 2690 /* try transferring iterative blocks of memory */ 2691 2692 /* flush all remaining blocks regardless of rate limiting */ 2693 while (true) { 2694 int pages; 2695 2696 pages = ram_find_and_save_block(rs, !migration_in_colo_state()); 2697 /* no more blocks to sent */ 2698 if (pages == 0) { 2699 break; 2700 } 2701 } 2702 2703 flush_compressed_data(rs); 2704 ram_control_after_iterate(f, RAM_CONTROL_FINISH); 2705 2706 rcu_read_unlock(); 2707 2708 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 2709 2710 return 0; 2711 } 2712 2713 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, 2714 uint64_t *res_precopy_only, 2715 uint64_t *res_compatible, 2716 uint64_t *res_postcopy_only) 2717 { 2718 RAMState **temp = opaque; 2719 RAMState *rs = *temp; 2720 uint64_t remaining_size; 2721 2722 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2723 2724 if (!migration_in_postcopy() && 2725 remaining_size < max_size) { 2726 qemu_mutex_lock_iothread(); 2727 rcu_read_lock(); 2728 migration_bitmap_sync(rs); 2729 rcu_read_unlock(); 2730 qemu_mutex_unlock_iothread(); 2731 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 2732 } 2733 2734 if (migrate_postcopy_ram()) { 2735 /* We can do postcopy, and all the data is postcopiable */ 2736 *res_compatible += remaining_size; 2737 } else { 2738 *res_precopy_only += remaining_size; 2739 } 2740 } 2741 2742 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 2743 { 2744 unsigned int xh_len; 2745 int xh_flags; 2746 uint8_t *loaded_data; 2747 2748 /* extract RLE header */ 2749 xh_flags = qemu_get_byte(f); 2750 xh_len = qemu_get_be16(f); 2751 2752 if (xh_flags != ENCODING_FLAG_XBZRLE) { 2753 error_report("Failed to load XBZRLE page - wrong compression!"); 2754 return -1; 2755 } 2756 2757 if (xh_len > TARGET_PAGE_SIZE) { 2758 error_report("Failed to load XBZRLE page - len overflow!"); 2759 return -1; 2760 } 2761 loaded_data = XBZRLE.decoded_buf; 2762 /* load data and decode */ 2763 /* it can change loaded_data to point to an internal buffer */ 2764 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 2765 2766 /* decode RLE */ 2767 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 2768 TARGET_PAGE_SIZE) == -1) { 2769 error_report("Failed to load XBZRLE page - decode error!"); 2770 return -1; 2771 } 2772 2773 return 0; 2774 } 2775 2776 /** 2777 * ram_block_from_stream: read a RAMBlock id from the migration stream 2778 * 2779 * Must be called from within a rcu critical section. 2780 * 2781 * Returns a pointer from within the RCU-protected ram_list. 
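 * The block that is found is cached in a static variable, so that later
 * pages flagged with RAM_SAVE_FLAG_CONTINUE can reuse it without the
 * stream having to repeat the block name.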
2782 * 2783 * @f: QEMUFile where to read the data from 2784 * @flags: Page flags (mostly to see if it's a continuation of previous block) 2785 */ 2786 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) 2787 { 2788 static RAMBlock *block = NULL; 2789 char id[256]; 2790 uint8_t len; 2791 2792 if (flags & RAM_SAVE_FLAG_CONTINUE) { 2793 if (!block) { 2794 error_report("Ack, bad migration stream!"); 2795 return NULL; 2796 } 2797 return block; 2798 } 2799 2800 len = qemu_get_byte(f); 2801 qemu_get_buffer(f, (uint8_t *)id, len); 2802 id[len] = 0; 2803 2804 block = qemu_ram_block_by_name(id); 2805 if (!block) { 2806 error_report("Can't find block %s", id); 2807 return NULL; 2808 } 2809 2810 return block; 2811 } 2812 2813 static inline void *host_from_ram_block_offset(RAMBlock *block, 2814 ram_addr_t offset) 2815 { 2816 if (!offset_in_ramblock(block, offset)) { 2817 return NULL; 2818 } 2819 2820 return block->host + offset; 2821 } 2822 2823 /** 2824 * ram_handle_compressed: handle the zero page case 2825 * 2826 * If a page (or a whole RDMA chunk) has been 2827 * determined to be zero, then zap it. 2828 * 2829 * @host: host address for the zero page 2830 * @ch: what the page is filled from. We only support zero 2831 * @size: size of the zero page 2832 */ 2833 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) 2834 { 2835 if (ch != 0 || !is_zero_range(host, size)) { 2836 memset(host, ch, size); 2837 } 2838 } 2839 2840 /* return the size after decompression, or negative value on error */ 2841 static int 2842 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, 2843 const uint8_t *source, size_t source_len) 2844 { 2845 int err; 2846 2847 err = inflateReset(stream); 2848 if (err != Z_OK) { 2849 return -1; 2850 } 2851 2852 stream->avail_in = source_len; 2853 stream->next_in = (uint8_t *)source; 2854 stream->avail_out = dest_len; 2855 stream->next_out = dest; 2856 2857 err = inflate(stream, Z_NO_FLUSH); 2858 if (err != Z_STREAM_END) { 2859 return -1; 2860 } 2861 2862 return stream->total_out; 2863 } 2864 2865 static void *do_data_decompress(void *opaque) 2866 { 2867 DecompressParam *param = opaque; 2868 unsigned long pagesize; 2869 uint8_t *des; 2870 int len, ret; 2871 2872 qemu_mutex_lock(¶m->mutex); 2873 while (!param->quit) { 2874 if (param->des) { 2875 des = param->des; 2876 len = param->len; 2877 param->des = 0; 2878 qemu_mutex_unlock(¶m->mutex); 2879 2880 pagesize = TARGET_PAGE_SIZE; 2881 2882 ret = qemu_uncompress_data(¶m->stream, des, pagesize, 2883 param->compbuf, len); 2884 if (ret < 0) { 2885 error_report("decompress data failed"); 2886 qemu_file_set_error(decomp_file, ret); 2887 } 2888 2889 qemu_mutex_lock(&decomp_done_lock); 2890 param->done = true; 2891 qemu_cond_signal(&decomp_done_cond); 2892 qemu_mutex_unlock(&decomp_done_lock); 2893 2894 qemu_mutex_lock(¶m->mutex); 2895 } else { 2896 qemu_cond_wait(¶m->cond, ¶m->mutex); 2897 } 2898 } 2899 qemu_mutex_unlock(¶m->mutex); 2900 2901 return NULL; 2902 } 2903 2904 static int wait_for_decompress_done(void) 2905 { 2906 int idx, thread_count; 2907 2908 if (!migrate_use_compression()) { 2909 return 0; 2910 } 2911 2912 thread_count = migrate_decompress_threads(); 2913 qemu_mutex_lock(&decomp_done_lock); 2914 for (idx = 0; idx < thread_count; idx++) { 2915 while (!decomp_param[idx].done) { 2916 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 2917 } 2918 } 2919 qemu_mutex_unlock(&decomp_done_lock); 2920 return qemu_file_get_error(decomp_file); 2921 } 2922 2923 static void 
compress_threads_load_cleanup(void) 2924 { 2925 int i, thread_count; 2926 2927 if (!migrate_use_compression()) { 2928 return; 2929 } 2930 thread_count = migrate_decompress_threads(); 2931 for (i = 0; i < thread_count; i++) { 2932 /* 2933 * we use it as a indicator which shows if the thread is 2934 * properly init'd or not 2935 */ 2936 if (!decomp_param[i].compbuf) { 2937 break; 2938 } 2939 2940 qemu_mutex_lock(&decomp_param[i].mutex); 2941 decomp_param[i].quit = true; 2942 qemu_cond_signal(&decomp_param[i].cond); 2943 qemu_mutex_unlock(&decomp_param[i].mutex); 2944 } 2945 for (i = 0; i < thread_count; i++) { 2946 if (!decomp_param[i].compbuf) { 2947 break; 2948 } 2949 2950 qemu_thread_join(decompress_threads + i); 2951 qemu_mutex_destroy(&decomp_param[i].mutex); 2952 qemu_cond_destroy(&decomp_param[i].cond); 2953 inflateEnd(&decomp_param[i].stream); 2954 g_free(decomp_param[i].compbuf); 2955 decomp_param[i].compbuf = NULL; 2956 } 2957 g_free(decompress_threads); 2958 g_free(decomp_param); 2959 decompress_threads = NULL; 2960 decomp_param = NULL; 2961 decomp_file = NULL; 2962 } 2963 2964 static int compress_threads_load_setup(QEMUFile *f) 2965 { 2966 int i, thread_count; 2967 2968 if (!migrate_use_compression()) { 2969 return 0; 2970 } 2971 2972 thread_count = migrate_decompress_threads(); 2973 decompress_threads = g_new0(QemuThread, thread_count); 2974 decomp_param = g_new0(DecompressParam, thread_count); 2975 qemu_mutex_init(&decomp_done_lock); 2976 qemu_cond_init(&decomp_done_cond); 2977 decomp_file = f; 2978 for (i = 0; i < thread_count; i++) { 2979 if (inflateInit(&decomp_param[i].stream) != Z_OK) { 2980 goto exit; 2981 } 2982 2983 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); 2984 qemu_mutex_init(&decomp_param[i].mutex); 2985 qemu_cond_init(&decomp_param[i].cond); 2986 decomp_param[i].done = true; 2987 decomp_param[i].quit = false; 2988 qemu_thread_create(decompress_threads + i, "decompress", 2989 do_data_decompress, decomp_param + i, 2990 QEMU_THREAD_JOINABLE); 2991 } 2992 return 0; 2993 exit: 2994 compress_threads_load_cleanup(); 2995 return -1; 2996 } 2997 2998 static void decompress_data_with_multi_threads(QEMUFile *f, 2999 void *host, int len) 3000 { 3001 int idx, thread_count; 3002 3003 thread_count = migrate_decompress_threads(); 3004 qemu_mutex_lock(&decomp_done_lock); 3005 while (true) { 3006 for (idx = 0; idx < thread_count; idx++) { 3007 if (decomp_param[idx].done) { 3008 decomp_param[idx].done = false; 3009 qemu_mutex_lock(&decomp_param[idx].mutex); 3010 qemu_get_buffer(f, decomp_param[idx].compbuf, len); 3011 decomp_param[idx].des = host; 3012 decomp_param[idx].len = len; 3013 qemu_cond_signal(&decomp_param[idx].cond); 3014 qemu_mutex_unlock(&decomp_param[idx].mutex); 3015 break; 3016 } 3017 } 3018 if (idx < thread_count) { 3019 break; 3020 } else { 3021 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock); 3022 } 3023 } 3024 qemu_mutex_unlock(&decomp_done_lock); 3025 } 3026 3027 /** 3028 * ram_load_setup: Setup RAM for migration incoming side 3029 * 3030 * Returns zero to indicate success and negative for error 3031 * 3032 * @f: QEMUFile where to receive the data 3033 * @opaque: RAMState pointer 3034 */ 3035 static int ram_load_setup(QEMUFile *f, void *opaque) 3036 { 3037 if (compress_threads_load_setup(f)) { 3038 return -1; 3039 } 3040 3041 xbzrle_load_setup(); 3042 ramblock_recv_map_init(); 3043 return 0; 3044 } 3045 3046 static int ram_load_cleanup(void *opaque) 3047 { 3048 RAMBlock *rb; 3049 xbzrle_load_cleanup(); 3050 
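    /* Join and free the decompression worker threads; this is a no-op when
     * compression is not in use. */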
compress_threads_load_cleanup(); 3051 3052 RAMBLOCK_FOREACH(rb) { 3053 g_free(rb->receivedmap); 3054 rb->receivedmap = NULL; 3055 } 3056 return 0; 3057 } 3058 3059 /** 3060 * ram_postcopy_incoming_init: allocate postcopy data structures 3061 * 3062 * Returns 0 for success and negative if there was one error 3063 * 3064 * @mis: current migration incoming state 3065 * 3066 * Allocate data structures etc needed by incoming migration with 3067 * postcopy-ram. postcopy-ram's similarly names 3068 * postcopy_ram_incoming_init does the work. 3069 */ 3070 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3071 { 3072 unsigned long ram_pages = last_ram_page(); 3073 3074 return postcopy_ram_incoming_init(mis, ram_pages); 3075 } 3076 3077 /** 3078 * ram_load_postcopy: load a page in postcopy case 3079 * 3080 * Returns 0 for success or -errno in case of error 3081 * 3082 * Called in postcopy mode by ram_load(). 3083 * rcu_read_lock is taken prior to this being called. 3084 * 3085 * @f: QEMUFile where to send the data 3086 */ 3087 static int ram_load_postcopy(QEMUFile *f) 3088 { 3089 int flags = 0, ret = 0; 3090 bool place_needed = false; 3091 bool matching_page_sizes = false; 3092 MigrationIncomingState *mis = migration_incoming_get_current(); 3093 /* Temporary page that is later 'placed' */ 3094 void *postcopy_host_page = postcopy_get_tmp_page(mis); 3095 void *last_host = NULL; 3096 bool all_zero = false; 3097 3098 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3099 ram_addr_t addr; 3100 void *host = NULL; 3101 void *page_buffer = NULL; 3102 void *place_source = NULL; 3103 RAMBlock *block = NULL; 3104 uint8_t ch; 3105 3106 addr = qemu_get_be64(f); 3107 3108 /* 3109 * If qemu file error, we should stop here, and then "addr" 3110 * may be invalid 3111 */ 3112 ret = qemu_file_get_error(f); 3113 if (ret) { 3114 break; 3115 } 3116 3117 flags = addr & ~TARGET_PAGE_MASK; 3118 addr &= TARGET_PAGE_MASK; 3119 3120 trace_ram_load_postcopy_loop((uint64_t)addr, flags); 3121 place_needed = false; 3122 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3123 block = ram_block_from_stream(f, flags); 3124 3125 host = host_from_ram_block_offset(block, addr); 3126 if (!host) { 3127 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3128 ret = -EINVAL; 3129 break; 3130 } 3131 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE; 3132 /* 3133 * Postcopy requires that we place whole host pages atomically; 3134 * these may be huge pages for RAMBlocks that are backed by 3135 * hugetlbfs. 3136 * To make it atomic, the data is read into a temporary page 3137 * that's moved into place later. 3138 * The migration protocol uses, possibly smaller, target-pages 3139 * however the source ensures it always sends all the components 3140 * of a host page in order. 
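 * For example, a 2MB hugetlbfs-backed host page arrives as 512 consecutive
 * 4KB target pages; they are accumulated in the temporary page and placed
 * into guest memory with a single atomic operation once the final target
 * page of the host page has been received.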
3141 */ 3142 page_buffer = postcopy_host_page + 3143 ((uintptr_t)host & (block->page_size - 1)); 3144 /* If all TP are zero then we can optimise the place */ 3145 if (!((uintptr_t)host & (block->page_size - 1))) { 3146 all_zero = true; 3147 } else { 3148 /* not the 1st TP within the HP */ 3149 if (host != (last_host + TARGET_PAGE_SIZE)) { 3150 error_report("Non-sequential target page %p/%p", 3151 host, last_host); 3152 ret = -EINVAL; 3153 break; 3154 } 3155 } 3156 3157 3158 /* 3159 * If it's the last part of a host page then we place the host 3160 * page 3161 */ 3162 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & 3163 (block->page_size - 1)) == 0; 3164 place_source = postcopy_host_page; 3165 } 3166 last_host = host; 3167 3168 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3169 case RAM_SAVE_FLAG_ZERO: 3170 ch = qemu_get_byte(f); 3171 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3172 if (ch) { 3173 all_zero = false; 3174 } 3175 break; 3176 3177 case RAM_SAVE_FLAG_PAGE: 3178 all_zero = false; 3179 if (!place_needed || !matching_page_sizes) { 3180 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3181 } else { 3182 /* Avoids the qemu_file copy during postcopy, which is 3183 * going to do a copy later; can only do it when we 3184 * do this read in one go (matching page sizes) 3185 */ 3186 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3187 TARGET_PAGE_SIZE); 3188 } 3189 break; 3190 case RAM_SAVE_FLAG_EOS: 3191 /* normal exit */ 3192 break; 3193 default: 3194 error_report("Unknown combination of migration flags: %#x" 3195 " (postcopy mode)", flags); 3196 ret = -EINVAL; 3197 break; 3198 } 3199 3200 /* Detect for any possible file errors */ 3201 if (!ret && qemu_file_get_error(f)) { 3202 ret = qemu_file_get_error(f); 3203 } 3204 3205 if (!ret && place_needed) { 3206 /* This gets called at the last target page in the host page */ 3207 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size; 3208 3209 if (all_zero) { 3210 ret = postcopy_place_page_zero(mis, place_dest, 3211 block); 3212 } else { 3213 ret = postcopy_place_page(mis, place_dest, 3214 place_source, block); 3215 } 3216 } 3217 } 3218 3219 return ret; 3220 } 3221 3222 static bool postcopy_is_advised(void) 3223 { 3224 PostcopyState ps = postcopy_state_get(); 3225 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 3226 } 3227 3228 static bool postcopy_is_running(void) 3229 { 3230 PostcopyState ps = postcopy_state_get(); 3231 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3232 } 3233 3234 static int ram_load(QEMUFile *f, void *opaque, int version_id) 3235 { 3236 int flags = 0, ret = 0, invalid_flags = 0; 3237 static uint64_t seq_iter; 3238 int len = 0; 3239 /* 3240 * If system is running in postcopy mode, page inserts to host memory must 3241 * be atomic 3242 */ 3243 bool postcopy_running = postcopy_is_running(); 3244 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 3245 bool postcopy_advised = postcopy_is_advised(); 3246 3247 seq_iter++; 3248 3249 if (version_id != 4) { 3250 ret = -EINVAL; 3251 } 3252 3253 if (!migrate_use_compression()) { 3254 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; 3255 } 3256 /* This RCU critical section can be very long running. 3257 * When RCU reclaims in the code start to become numerous, 3258 * it will be necessary to reduce the granularity of this 3259 * critical section. 
3260 */ 3261 rcu_read_lock(); 3262 3263 if (postcopy_running) { 3264 ret = ram_load_postcopy(f); 3265 } 3266 3267 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3268 ram_addr_t addr, total_ram_bytes; 3269 void *host = NULL; 3270 uint8_t ch; 3271 3272 addr = qemu_get_be64(f); 3273 flags = addr & ~TARGET_PAGE_MASK; 3274 addr &= TARGET_PAGE_MASK; 3275 3276 if (flags & invalid_flags) { 3277 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { 3278 error_report("Received an unexpected compressed page"); 3279 } 3280 3281 ret = -EINVAL; 3282 break; 3283 } 3284 3285 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 3286 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { 3287 RAMBlock *block = ram_block_from_stream(f, flags); 3288 3289 host = host_from_ram_block_offset(block, addr); 3290 if (!host) { 3291 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3292 ret = -EINVAL; 3293 break; 3294 } 3295 ramblock_recv_bitmap_set(block, host); 3296 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 3297 } 3298 3299 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3300 case RAM_SAVE_FLAG_MEM_SIZE: 3301 /* Synchronize RAM block list */ 3302 total_ram_bytes = addr; 3303 while (!ret && total_ram_bytes) { 3304 RAMBlock *block; 3305 char id[256]; 3306 ram_addr_t length; 3307 3308 len = qemu_get_byte(f); 3309 qemu_get_buffer(f, (uint8_t *)id, len); 3310 id[len] = 0; 3311 length = qemu_get_be64(f); 3312 3313 block = qemu_ram_block_by_name(id); 3314 if (block) { 3315 if (length != block->used_length) { 3316 Error *local_err = NULL; 3317 3318 ret = qemu_ram_resize(block, length, 3319 &local_err); 3320 if (local_err) { 3321 error_report_err(local_err); 3322 } 3323 } 3324 /* For postcopy we need to check hugepage sizes match */ 3325 if (postcopy_advised && 3326 block->page_size != qemu_host_page_size) { 3327 uint64_t remote_page_size = qemu_get_be64(f); 3328 if (remote_page_size != block->page_size) { 3329 error_report("Mismatched RAM page size %s " 3330 "(local) %zd != %" PRId64, 3331 id, block->page_size, 3332 remote_page_size); 3333 ret = -EINVAL; 3334 } 3335 } 3336 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, 3337 block->idstr); 3338 } else { 3339 error_report("Unknown ramblock \"%s\", cannot " 3340 "accept migration", id); 3341 ret = -EINVAL; 3342 } 3343 3344 total_ram_bytes -= length; 3345 } 3346 break; 3347 3348 case RAM_SAVE_FLAG_ZERO: 3349 ch = qemu_get_byte(f); 3350 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); 3351 break; 3352 3353 case RAM_SAVE_FLAG_PAGE: 3354 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 3355 break; 3356 3357 case RAM_SAVE_FLAG_COMPRESS_PAGE: 3358 len = qemu_get_be32(f); 3359 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { 3360 error_report("Invalid compressed data length: %d", len); 3361 ret = -EINVAL; 3362 break; 3363 } 3364 decompress_data_with_multi_threads(f, host, len); 3365 break; 3366 3367 case RAM_SAVE_FLAG_XBZRLE: 3368 if (load_xbzrle(f, addr, host) < 0) { 3369 error_report("Failed to decompress XBZRLE page at " 3370 RAM_ADDR_FMT, addr); 3371 ret = -EINVAL; 3372 break; 3373 } 3374 break; 3375 case RAM_SAVE_FLAG_EOS: 3376 /* normal exit */ 3377 break; 3378 default: 3379 if (flags & RAM_SAVE_FLAG_HOOK) { 3380 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); 3381 } else { 3382 error_report("Unknown combination of migration flags: %#x", 3383 flags); 3384 ret = -EINVAL; 3385 } 3386 } 3387 if (!ret) { 3388 ret = qemu_file_get_error(f); 3389 } 3390 } 3391 3392 ret |= wait_for_decompress_done(); 3393 rcu_read_unlock(); 3394 
trace_ram_load_complete(ret, seq_iter); 3395 return ret; 3396 } 3397 3398 static bool ram_has_postcopy(void *opaque) 3399 { 3400 return migrate_postcopy_ram(); 3401 } 3402 3403 /* Sync all the dirty bitmaps with the destination VM. */ 3404 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 3405 { 3406 RAMBlock *block; 3407 QEMUFile *file = s->to_dst_file; 3408 int ramblock_count = 0; 3409 3410 trace_ram_dirty_bitmap_sync_start(); 3411 3412 RAMBLOCK_FOREACH(block) { 3413 qemu_savevm_send_recv_bitmap(file, block->idstr); 3414 trace_ram_dirty_bitmap_request(block->idstr); 3415 ramblock_count++; 3416 } 3417 3418 trace_ram_dirty_bitmap_sync_wait(); 3419 3420 /* Wait until all the ramblocks' dirty bitmaps are synced */ 3421 while (ramblock_count--) { 3422 qemu_sem_wait(&s->rp_state.rp_sem); 3423 } 3424 3425 trace_ram_dirty_bitmap_sync_complete(); 3426 3427 return 0; 3428 } 3429 3430 static void ram_dirty_bitmap_reload_notify(MigrationState *s) 3431 { 3432 qemu_sem_post(&s->rp_state.rp_sem); 3433 } 3434 3435 /* 3436 * Read the received bitmap and invert it to form the initial dirty bitmap. 3437 * This is only used when the postcopy migration is paused but wants 3438 * to resume from a mid-way point. 3439 */ 3440 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) 3441 { 3442 int ret = -EINVAL; 3443 QEMUFile *file = s->rp_state.from_dst_file; 3444 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; 3445 uint64_t local_size = nbits / 8; 3446 uint64_t size, end_mark; 3447 3448 trace_ram_dirty_bitmap_reload_begin(block->idstr); 3449 3450 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 3451 error_report("%s: incorrect state %s", __func__, 3452 MigrationStatus_str(s->state)); 3453 return -EINVAL; 3454 } 3455 3456 /* 3457 * Note: see comments in ramblock_recv_bitmap_send() on why we 3458 * need the endianness conversion and the padding. 3459 */ 3460 local_size = ROUND_UP(local_size, 8); 3461 3462 /* Add padding */ 3463 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 3464 3465 size = qemu_get_be64(file); 3466 3467 /* The size of the bitmap should match our ramblock */ 3468 if (size != local_size) { 3469 error_report("%s: ramblock '%s' bitmap size mismatch " 3470 "(0x%"PRIx64" != 0x%"PRIx64")", __func__, 3471 block->idstr, size, local_size); 3472 ret = -EINVAL; 3473 goto out; 3474 } 3475 3476 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 3477 end_mark = qemu_get_be64(file); 3478 3479 ret = qemu_file_get_error(file); 3480 if (ret || size != local_size) { 3481 error_report("%s: read bitmap failed for ramblock '%s': %d" 3482 " (size 0x%"PRIx64", got: 0x%"PRIx64")", 3483 __func__, block->idstr, ret, local_size, size); 3484 ret = -EIO; 3485 goto out; 3486 } 3487 3488 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 3489 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, 3490 __func__, block->idstr, end_mark); 3491 ret = -EINVAL; 3492 goto out; 3493 } 3494 3495 /* 3496 * Endianness conversion. We are in postcopy (though paused). 3497 * The dirty bitmap won't change, so we can modify it directly. 3498 */ 3499 bitmap_from_le(block->bmap, le_bitmap, nbits); 3500 3501 /* 3502 * What we received is the "received bitmap". Invert it to get the 3503 * initial dirty bitmap for this ramblock. 3504 */ 3505 bitmap_complement(block->bmap, block->bmap, nbits); 3506 3507 trace_ram_dirty_bitmap_reload_complete(block->idstr); 3508 3509 /* 3510 * We have successfully synced the bitmap for this ramblock. If this is 3511 * the last one to sync, we need to notify the main send thread. 3512 */ 3513 ram_dirty_bitmap_reload_notify(s); 3514 3515 ret = 0; 3516 out: 3517 g_free(le_bitmap); 3518 return ret; 3519 } 3520 3521 static int ram_resume_prepare(MigrationState *s, void *opaque) 3522 { 3523 RAMState *rs = *(RAMState **)opaque; 3524 int ret; 3525 3526 ret = ram_dirty_bitmap_sync_all(s, rs); 3527 if (ret) { 3528 return ret; 3529 } 3530 3531 ram_state_resume_prepare(rs, s->to_dst_file); 3532 3533 return 0; 3534 } 3535 3536 static SaveVMHandlers savevm_ram_handlers = { 3537 .save_setup = ram_save_setup, 3538 .save_live_iterate = ram_save_iterate, 3539 .save_live_complete_postcopy = ram_save_complete, 3540 .save_live_complete_precopy = ram_save_complete, 3541 .has_postcopy = ram_has_postcopy, 3542 .save_live_pending = ram_save_pending, 3543 .load_state = ram_load, 3544 .save_cleanup = ram_save_cleanup, 3545 .load_setup = ram_load_setup, 3546 .load_cleanup = ram_load_cleanup, 3547 .resume_prepare = ram_resume_prepare, 3548 }; 3549 3550 void ram_mig_init(void) 3551 { 3552 qemu_mutex_init(&XBZRLE.lock); 3553 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state); 3554 } 3555