1 /* 2 * RDMA protocol and interfaces 3 * 4 * Copyright IBM, Corp. 2010-2013 5 * Copyright Red Hat, Inc. 2015-2016 6 * 7 * Authors: 8 * Michael R. Hines <mrhines@us.ibm.com> 9 * Jiuxing Liu <jl@us.ibm.com> 10 * Daniel P. Berrange <berrange@redhat.com> 11 * 12 * This work is licensed under the terms of the GNU GPL, version 2 or 13 * later. See the COPYING file in the top-level directory. 14 * 15 */ 16 17 #include "qemu/osdep.h" 18 #include "qapi/error.h" 19 #include "qemu/cutils.h" 20 #include "exec/target_page.h" 21 #include "rdma.h" 22 #include "migration.h" 23 #include "migration-stats.h" 24 #include "qemu-file.h" 25 #include "ram.h" 26 #include "qemu/error-report.h" 27 #include "qemu/main-loop.h" 28 #include "qemu/module.h" 29 #include "qemu/rcu.h" 30 #include "qemu/sockets.h" 31 #include "qemu/bitmap.h" 32 #include "qemu/coroutine.h" 33 #include "exec/memory.h" 34 #include <sys/socket.h> 35 #include <netdb.h> 36 #include <arpa/inet.h> 37 #include <rdma/rdma_cma.h> 38 #include "trace.h" 39 #include "qom/object.h" 40 #include "options.h" 41 #include <poll.h> 42 43 #define RDMA_RESOLVE_TIMEOUT_MS 10000 44 45 /* Do not merge data if larger than this. */ 46 #define RDMA_MERGE_MAX (2 * 1024 * 1024) 47 #define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096) 48 49 #define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */ 50 51 /* 52 * This is only for non-live state being migrated. 53 * Instead of RDMA_WRITE messages, we use RDMA_SEND 54 * messages for that state, which requires a different 55 * delivery design than main memory. 56 */ 57 #define RDMA_SEND_INCREMENT 32768 58 59 /* 60 * Maximum size infiniband SEND message 61 */ 62 #define RDMA_CONTROL_MAX_BUFFER (512 * 1024) 63 #define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096 64 65 #define RDMA_CONTROL_VERSION_CURRENT 1 66 /* 67 * Capabilities for negotiation. 68 */ 69 #define RDMA_CAPABILITY_PIN_ALL 0x01 70 71 /* 72 * Add the other flags above to this list of known capabilities 73 * as they are introduced. 
74 */ 75 static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL; 76 77 /* 78 * A work request ID is 64-bits and we split up these bits 79 * into 3 parts: 80 * 81 * bits 0-15 : type of control message, 2^16 82 * bits 16-29: ram block index, 2^14 83 * bits 30-63: ram block chunk number, 2^34 84 * 85 * The last two bit ranges are only used for RDMA writes, 86 * in order to track their completion and potentially 87 * also track unregistration status of the message. 88 */ 89 #define RDMA_WRID_TYPE_SHIFT 0UL 90 #define RDMA_WRID_BLOCK_SHIFT 16UL 91 #define RDMA_WRID_CHUNK_SHIFT 30UL 92 93 #define RDMA_WRID_TYPE_MASK \ 94 ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL) 95 96 #define RDMA_WRID_BLOCK_MASK \ 97 (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL)) 98 99 #define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK) 100 101 /* 102 * RDMA migration protocol: 103 * 1. RDMA Writes (data messages, i.e. RAM) 104 * 2. IB Send/Recv (control channel messages) 105 */ 106 enum { 107 RDMA_WRID_NONE = 0, 108 RDMA_WRID_RDMA_WRITE = 1, 109 RDMA_WRID_SEND_CONTROL = 2000, 110 RDMA_WRID_RECV_CONTROL = 4000, 111 }; 112 113 /* 114 * Work request IDs for IB SEND messages only (not RDMA writes). 115 * This is used by the migration protocol to transmit 116 * control messages (such as device state and registration commands) 117 * 118 * We could use more WRs, but we have enough for now. 119 */ 120 enum { 121 RDMA_WRID_READY = 0, 122 RDMA_WRID_DATA, 123 RDMA_WRID_CONTROL, 124 RDMA_WRID_MAX, 125 }; 126 127 /* 128 * SEND/RECV IB Control Messages. 
129 */ 130 enum { 131 RDMA_CONTROL_NONE = 0, 132 RDMA_CONTROL_ERROR, 133 RDMA_CONTROL_READY, /* ready to receive */ 134 RDMA_CONTROL_QEMU_FILE, /* QEMUFile-transmitted bytes */ 135 RDMA_CONTROL_RAM_BLOCKS_REQUEST, /* RAMBlock synchronization */ 136 RDMA_CONTROL_RAM_BLOCKS_RESULT, /* RAMBlock synchronization */ 137 RDMA_CONTROL_COMPRESS, /* page contains repeat values */ 138 RDMA_CONTROL_REGISTER_REQUEST, /* dynamic page registration */ 139 RDMA_CONTROL_REGISTER_RESULT, /* key to use after registration */ 140 RDMA_CONTROL_REGISTER_FINISHED, /* current iteration finished */ 141 RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */ 142 RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */ 143 }; 144 145 146 /* 147 * Memory and MR structures used to represent an IB Send/Recv work request. 148 * This is *not* used for RDMA writes, only IB Send/Recv. 149 */ 150 typedef struct { 151 uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */ 152 struct ibv_mr *control_mr; /* registration metadata */ 153 size_t control_len; /* length of the message */ 154 uint8_t *control_curr; /* start of unconsumed bytes */ 155 } RDMAWorkRequestData; 156 157 /* 158 * Negotiate RDMA capabilities during connection-setup time. 159 */ 160 typedef struct { 161 uint32_t version; 162 uint32_t flags; 163 } RDMACapabilities; 164 165 static void caps_to_network(RDMACapabilities *cap) 166 { 167 cap->version = htonl(cap->version); 168 cap->flags = htonl(cap->flags); 169 } 170 171 static void network_to_caps(RDMACapabilities *cap) 172 { 173 cap->version = ntohl(cap->version); 174 cap->flags = ntohl(cap->flags); 175 } 176 177 /* 178 * Representation of a RAMBlock from an RDMA perspective. 179 * This is not transmitted, only local. 180 * This and subsequent structures cannot be linked lists 181 * because we're using a single IB message to transmit 182 * the information. It's small anyway, so a list is overkill. 
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 *
 * Entries are created by rdma_add_block() and torn down by
 * rdma_delete_block(); both reallocate the containing array, so pointers
 * to an entry do not survive either call.
 */
typedef struct RDMALocalBlock {
    char          *block_name;  /* g_strdup()ed copy, freed on delete */
    uint8_t       *local_host_addr; /* local virtual address */
    uint64_t       remote_host_addr; /* remote virtual address */
    uint64_t       offset;      /* block_offset given to rdma_add_block() */
    uint64_t       length;      /* length given to rdma_add_block() */
    struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
    struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */
    int            index;           /* which block are we */
    unsigned int   src_index;       /* (Only used on dest) */
    bool           is_ram_block;    /* true if added before local->init is set */
    int            nb_chunks;       /* number of RDMA_REG_CHUNK_SHIFT-sized chunks */
    unsigned long *transit_bitmap;    /* nb_chunks bits */
    unsigned long *unregister_bitmap; /* nb_chunks bits */
} RDMALocalBlock;
208 */ 209 typedef struct QEMU_PACKED RDMADestBlock { 210 uint64_t remote_host_addr; 211 uint64_t offset; 212 uint64_t length; 213 uint32_t remote_rkey; 214 uint32_t padding; 215 } RDMADestBlock; 216 217 static const char *control_desc(unsigned int rdma_control) 218 { 219 static const char *strs[] = { 220 [RDMA_CONTROL_NONE] = "NONE", 221 [RDMA_CONTROL_ERROR] = "ERROR", 222 [RDMA_CONTROL_READY] = "READY", 223 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE", 224 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST", 225 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT", 226 [RDMA_CONTROL_COMPRESS] = "COMPRESS", 227 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST", 228 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT", 229 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED", 230 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST", 231 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED", 232 }; 233 234 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) { 235 return "??BAD CONTROL VALUE??"; 236 } 237 238 return strs[rdma_control]; 239 } 240 241 static uint64_t htonll(uint64_t v) 242 { 243 union { uint32_t lv[2]; uint64_t llv; } u; 244 u.lv[0] = htonl(v >> 32); 245 u.lv[1] = htonl(v & 0xFFFFFFFFULL); 246 return u.llv; 247 } 248 249 static uint64_t ntohll(uint64_t v) 250 { 251 union { uint32_t lv[2]; uint64_t llv; } u; 252 u.llv = v; 253 return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]); 254 } 255 256 static void dest_block_to_network(RDMADestBlock *db) 257 { 258 db->remote_host_addr = htonll(db->remote_host_addr); 259 db->offset = htonll(db->offset); 260 db->length = htonll(db->length); 261 db->remote_rkey = htonl(db->remote_rkey); 262 } 263 264 static void network_to_dest_block(RDMADestBlock *db) 265 { 266 db->remote_host_addr = ntohll(db->remote_host_addr); 267 db->offset = ntohll(db->offset); 268 db->length = ntohll(db->length); 269 db->remote_rkey = ntohl(db->remote_rkey); 270 } 271 272 /* 273 * Virtual address of the above 
/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;             /* number of entries in block[] */
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;     /* array, reallocated on every add/delete */
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /*
     * store info about current buffer so that we can
     * merge it with future sends
     */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context          *verbs;
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
    struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *recv_cq;                 /* receive completion queue */
    struct ibv_cq *send_cq;                 /* send completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    bool errored;
    bool error_reported;    /* set by rdma_errored() after its one report */
    bool received_error;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int    next_src_index;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    /* block offset -> RDMALocalBlock *, kept in sync by add/delete */
    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)



/* QIOChannel subclass holding one RDMAContext per direction. */
struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};
391 * This gets prepended at the beginning of every Send/Recv. 392 */ 393 typedef struct QEMU_PACKED { 394 uint32_t len; /* Total length of data portion */ 395 uint32_t type; /* which control command to perform */ 396 uint32_t repeat; /* number of commands in data portion of same type */ 397 uint32_t padding; 398 } RDMAControlHeader; 399 400 static void control_to_network(RDMAControlHeader *control) 401 { 402 control->type = htonl(control->type); 403 control->len = htonl(control->len); 404 control->repeat = htonl(control->repeat); 405 } 406 407 static void network_to_control(RDMAControlHeader *control) 408 { 409 control->type = ntohl(control->type); 410 control->len = ntohl(control->len); 411 control->repeat = ntohl(control->repeat); 412 } 413 414 /* 415 * Register a single Chunk. 416 * Information sent by the source VM to inform the dest 417 * to register an single chunk of memory before we can perform 418 * the actual RDMA operation. 419 */ 420 typedef struct QEMU_PACKED { 421 union QEMU_PACKED { 422 uint64_t current_addr; /* offset into the ram_addr_t space */ 423 uint64_t chunk; /* chunk to lookup if unregistering */ 424 } key; 425 uint32_t current_index; /* which ramblock the chunk belongs to */ 426 uint32_t padding; 427 uint64_t chunks; /* how many sequential chunks to register */ 428 } RDMARegister; 429 430 static bool rdma_errored(RDMAContext *rdma) 431 { 432 if (rdma->errored && !rdma->error_reported) { 433 error_report("RDMA is in an error state waiting migration" 434 " to abort!"); 435 rdma->error_reported = true; 436 } 437 return rdma->errored; 438 } 439 440 static void register_to_network(RDMAContext *rdma, RDMARegister *reg) 441 { 442 RDMALocalBlock *local_block; 443 local_block = &rdma->local_ram_blocks.block[reg->current_index]; 444 445 if (local_block->is_ram_block) { 446 /* 447 * current_addr as passed in is an address in the local ram_addr_t 448 * space, we need to translate this for the destination 449 */ 450 reg->key.current_addr -= 
local_block->offset; 451 reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset; 452 } 453 reg->key.current_addr = htonll(reg->key.current_addr); 454 reg->current_index = htonl(reg->current_index); 455 reg->chunks = htonll(reg->chunks); 456 } 457 458 static void network_to_register(RDMARegister *reg) 459 { 460 reg->key.current_addr = ntohll(reg->key.current_addr); 461 reg->current_index = ntohl(reg->current_index); 462 reg->chunks = ntohll(reg->chunks); 463 } 464 465 typedef struct QEMU_PACKED { 466 uint32_t value; /* if zero, we will madvise() */ 467 uint32_t block_idx; /* which ram block index */ 468 uint64_t offset; /* Address in remote ram_addr_t space */ 469 uint64_t length; /* length of the chunk */ 470 } RDMACompress; 471 472 static void compress_to_network(RDMAContext *rdma, RDMACompress *comp) 473 { 474 comp->value = htonl(comp->value); 475 /* 476 * comp->offset as passed in is an address in the local ram_addr_t 477 * space, we need to translate this for the destination 478 */ 479 comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset; 480 comp->offset += rdma->dest_blocks[comp->block_idx].offset; 481 comp->block_idx = htonl(comp->block_idx); 482 comp->offset = htonll(comp->offset); 483 comp->length = htonll(comp->length); 484 } 485 486 static void network_to_compress(RDMACompress *comp) 487 { 488 comp->value = ntohl(comp->value); 489 comp->block_idx = ntohl(comp->block_idx); 490 comp->offset = ntohll(comp->offset); 491 comp->length = ntohll(comp->length); 492 } 493 494 /* 495 * The result of the dest's memory registration produces an "rkey" 496 * which the source VM must reference in order to perform 497 * the RDMA operation. 
498 */ 499 typedef struct QEMU_PACKED { 500 uint32_t rkey; 501 uint32_t padding; 502 uint64_t host_addr; 503 } RDMARegisterResult; 504 505 static void result_to_network(RDMARegisterResult *result) 506 { 507 result->rkey = htonl(result->rkey); 508 result->host_addr = htonll(result->host_addr); 509 }; 510 511 static void network_to_result(RDMARegisterResult *result) 512 { 513 result->rkey = ntohl(result->rkey); 514 result->host_addr = ntohll(result->host_addr); 515 }; 516 517 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head, 518 uint8_t *data, RDMAControlHeader *resp, 519 int *resp_idx, 520 int (*callback)(RDMAContext *rdma, 521 Error **errp), 522 Error **errp); 523 524 static inline uint64_t ram_chunk_index(const uint8_t *start, 525 const uint8_t *host) 526 { 527 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT; 528 } 529 530 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block, 531 uint64_t i) 532 { 533 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr + 534 (i << RDMA_REG_CHUNK_SHIFT)); 535 } 536 537 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, 538 uint64_t i) 539 { 540 uint8_t *result = ram_chunk_start(rdma_ram_block, i) + 541 (1UL << RDMA_REG_CHUNK_SHIFT); 542 543 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) { 544 result = rdma_ram_block->local_host_addr + rdma_ram_block->length; 545 } 546 547 return result; 548 } 549 550 static void rdma_add_block(RDMAContext *rdma, const char *block_name, 551 void *host_addr, 552 ram_addr_t block_offset, uint64_t length) 553 { 554 RDMALocalBlocks *local = &rdma->local_ram_blocks; 555 RDMALocalBlock *block; 556 RDMALocalBlock *old = local->block; 557 558 local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1); 559 560 if (local->nb_blocks) { 561 if (rdma->blockmap) { 562 for (int x = 0; x < local->nb_blocks; x++) { 563 g_hash_table_remove(rdma->blockmap, 564 (void 
*)(uintptr_t)old[x].offset); 565 g_hash_table_insert(rdma->blockmap, 566 (void *)(uintptr_t)old[x].offset, 567 &local->block[x]); 568 } 569 } 570 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); 571 g_free(old); 572 } 573 574 block = &local->block[local->nb_blocks]; 575 576 block->block_name = g_strdup(block_name); 577 block->local_host_addr = host_addr; 578 block->offset = block_offset; 579 block->length = length; 580 block->index = local->nb_blocks; 581 block->src_index = ~0U; /* Filled in by the receipt of the block list */ 582 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; 583 block->transit_bitmap = bitmap_new(block->nb_chunks); 584 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); 585 block->unregister_bitmap = bitmap_new(block->nb_chunks); 586 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks); 587 block->remote_keys = g_new0(uint32_t, block->nb_chunks); 588 589 block->is_ram_block = local->init ? false : true; 590 591 if (rdma->blockmap) { 592 g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block); 593 } 594 595 trace_rdma_add_block(block_name, local->nb_blocks, 596 (uintptr_t) block->local_host_addr, 597 block->offset, block->length, 598 (uintptr_t) (block->local_host_addr + block->length), 599 BITS_TO_LONGS(block->nb_chunks) * 600 sizeof(unsigned long) * 8, 601 block->nb_chunks); 602 603 local->nb_blocks++; 604 } 605 606 /* 607 * Memory regions need to be registered with the device and queue pairs setup 608 * in advanced before the migration starts. This tells us where the RAM blocks 609 * are so that we can register them individually. 
610 */ 611 static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque) 612 { 613 const char *block_name = qemu_ram_get_idstr(rb); 614 void *host_addr = qemu_ram_get_host_addr(rb); 615 ram_addr_t block_offset = qemu_ram_get_offset(rb); 616 ram_addr_t length = qemu_ram_get_used_length(rb); 617 rdma_add_block(opaque, block_name, host_addr, block_offset, length); 618 return 0; 619 } 620 621 /* 622 * Identify the RAMBlocks and their quantity. They will be references to 623 * identify chunk boundaries inside each RAMBlock and also be referenced 624 * during dynamic page registration. 625 */ 626 static void qemu_rdma_init_ram_blocks(RDMAContext *rdma) 627 { 628 RDMALocalBlocks *local = &rdma->local_ram_blocks; 629 int ret; 630 631 assert(rdma->blockmap == NULL); 632 memset(local, 0, sizeof *local); 633 ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma); 634 assert(!ret); 635 trace_qemu_rdma_init_ram_blocks(local->nb_blocks); 636 rdma->dest_blocks = g_new0(RDMADestBlock, 637 rdma->local_ram_blocks.nb_blocks); 638 local->init = true; 639 } 640 641 /* 642 * Note: If used outside of cleanup, the caller must ensure that the destination 643 * block structures are also updated 644 */ 645 static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) 646 { 647 RDMALocalBlocks *local = &rdma->local_ram_blocks; 648 RDMALocalBlock *old = local->block; 649 650 if (rdma->blockmap) { 651 g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); 652 } 653 if (block->pmr) { 654 for (int j = 0; j < block->nb_chunks; j++) { 655 if (!block->pmr[j]) { 656 continue; 657 } 658 ibv_dereg_mr(block->pmr[j]); 659 rdma->total_registrations--; 660 } 661 g_free(block->pmr); 662 block->pmr = NULL; 663 } 664 665 if (block->mr) { 666 ibv_dereg_mr(block->mr); 667 rdma->total_registrations--; 668 block->mr = NULL; 669 } 670 671 g_free(block->transit_bitmap); 672 block->transit_bitmap = NULL; 673 674 g_free(block->unregister_bitmap); 675 block->unregister_bitmap = 
NULL; 676 677 g_free(block->remote_keys); 678 block->remote_keys = NULL; 679 680 g_free(block->block_name); 681 block->block_name = NULL; 682 683 if (rdma->blockmap) { 684 for (int x = 0; x < local->nb_blocks; x++) { 685 g_hash_table_remove(rdma->blockmap, 686 (void *)(uintptr_t)old[x].offset); 687 } 688 } 689 690 if (local->nb_blocks > 1) { 691 692 local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1); 693 694 if (block->index) { 695 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index); 696 } 697 698 if (block->index < (local->nb_blocks - 1)) { 699 memcpy(local->block + block->index, old + (block->index + 1), 700 sizeof(RDMALocalBlock) * 701 (local->nb_blocks - (block->index + 1))); 702 for (int x = block->index; x < local->nb_blocks - 1; x++) { 703 local->block[x].index--; 704 } 705 } 706 } else { 707 assert(block == local->block); 708 local->block = NULL; 709 } 710 711 trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr, 712 block->offset, block->length, 713 (uintptr_t)(block->local_host_addr + block->length), 714 BITS_TO_LONGS(block->nb_chunks) * 715 sizeof(unsigned long) * 8, block->nb_chunks); 716 717 g_free(old); 718 719 local->nb_blocks--; 720 721 if (local->nb_blocks && rdma->blockmap) { 722 for (int x = 0; x < local->nb_blocks; x++) { 723 g_hash_table_insert(rdma->blockmap, 724 (void *)(uintptr_t)local->block[x].offset, 725 &local->block[x]); 726 } 727 } 728 } 729 730 /* 731 * Trace RDMA device open, with device details. 732 */ 733 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs) 734 { 735 struct ibv_port_attr port; 736 737 if (ibv_query_port(verbs, 1, &port)) { 738 trace_qemu_rdma_dump_id_failed(who); 739 return; 740 } 741 742 trace_qemu_rdma_dump_id(who, 743 verbs->device->name, 744 verbs->device->dev_name, 745 verbs->device->dev_path, 746 verbs->device->ibdev_path, 747 port.link_layer, 748 port.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"Infiniband" 749 : port.link_layer == IBV_LINK_LAYER_ETHERNET ? "Ethernet" 750 : "Unknown"); 751 } 752 753 /* 754 * Trace RDMA gid addressing information. 755 * Useful for understanding the RDMA device hierarchy in the kernel. 756 */ 757 static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) 758 { 759 char sgid[33]; 760 char dgid[33]; 761 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid); 762 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid); 763 trace_qemu_rdma_dump_gid(who, sgid, dgid); 764 } 765 766 /* 767 * As of now, IPv6 over RoCE / iWARP is not supported by linux. 768 * We will try the next addrinfo struct, and fail if there are 769 * no other valid addresses to bind against. 770 * 771 * If user is listening on '[::]', then we will not have a opened a device 772 * yet and have no way of verifying if the device is RoCE or not. 773 * 774 * In this case, the source VM will throw an error for ALL types of 775 * connections (both IPv4 and IPv6) if the destination machine does not have 776 * a regular infiniband network available for use. 777 * 778 * The only way to guarantee that an error is thrown for broken kernels is 779 * for the management software to choose a *specific* interface at bind time 780 * and validate what time of hardware it is. 781 * 782 * Unfortunately, this puts the user in a fix: 783 * 784 * If the source VM connects with an IPv4 address without knowing that the 785 * destination has bound to '[::]' the migration will unconditionally fail 786 * unless the management software is explicitly listening on the IPv4 787 * address while using a RoCE-based device. 788 * 789 * If the source VM connects with an IPv6 address, then we're OK because we can 790 * throw an error on the source (and similarly on the destination). 791 * 792 * But in mixed environments, this will be broken for a while until it is fixed 793 * inside linux. 
794 * 795 * We do provide a *tiny* bit of help in this function: We can list all of the 796 * devices in the system and check to see if all the devices are RoCE or 797 * Infiniband. 798 * 799 * If we detect that we have a *pure* RoCE environment, then we can safely 800 * thrown an error even if the management software has specified '[::]' as the 801 * bind address. 802 * 803 * However, if there is are multiple hetergeneous devices, then we cannot make 804 * this assumption and the user just has to be sure they know what they are 805 * doing. 806 * 807 * Patches are being reviewed on linux-rdma. 808 */ 809 static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) 810 { 811 /* This bug only exists in linux, to our knowledge. */ 812 #ifdef CONFIG_LINUX 813 struct ibv_port_attr port_attr; 814 815 /* 816 * Verbs are only NULL if management has bound to '[::]'. 817 * 818 * Let's iterate through all the devices and see if there any pure IB 819 * devices (non-ethernet). 820 * 821 * If not, then we can safely proceed with the migration. 822 * Otherwise, there are no guarantees until the bug is fixed in linux. 823 */ 824 if (!verbs) { 825 int num_devices; 826 struct ibv_device **dev_list = ibv_get_device_list(&num_devices); 827 bool roce_found = false; 828 bool ib_found = false; 829 830 for (int x = 0; x < num_devices; x++) { 831 verbs = ibv_open_device(dev_list[x]); 832 /* 833 * ibv_open_device() is not documented to set errno. If 834 * it does, it's somebody else's doc bug. If it doesn't, 835 * the use of errno below is wrong. 836 * TODO Find out whether ibv_open_device() sets errno. 
837 */ 838 if (!verbs) { 839 if (errno == EPERM) { 840 continue; 841 } else { 842 error_setg_errno(errp, errno, 843 "could not open RDMA device context"); 844 return -1; 845 } 846 } 847 848 if (ibv_query_port(verbs, 1, &port_attr)) { 849 ibv_close_device(verbs); 850 error_setg(errp, 851 "RDMA ERROR: Could not query initial IB port"); 852 return -1; 853 } 854 855 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { 856 ib_found = true; 857 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { 858 roce_found = true; 859 } 860 861 ibv_close_device(verbs); 862 863 } 864 865 if (roce_found) { 866 if (ib_found) { 867 warn_report("migrations may fail:" 868 " IPv6 over RoCE / iWARP in linux" 869 " is broken. But since you appear to have a" 870 " mixed RoCE / IB environment, be sure to only" 871 " migrate over the IB fabric until the kernel " 872 " fixes the bug."); 873 } else { 874 error_setg(errp, "RDMA ERROR: " 875 "You only have RoCE / iWARP devices in your systems" 876 " and your management software has specified '[::]'" 877 ", but IPv6 over RoCE / iWARP is not supported in Linux."); 878 return -1; 879 } 880 } 881 882 return 0; 883 } 884 885 /* 886 * If we have a verbs context, that means that some other than '[::]' was 887 * used by the management software for binding. In which case we can 888 * actually warn the user about a potentially broken kernel. 
889 */ 890 891 /* IB ports start with 1, not 0 */ 892 if (ibv_query_port(verbs, 1, &port_attr)) { 893 error_setg(errp, "RDMA ERROR: Could not query initial IB port"); 894 return -1; 895 } 896 897 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { 898 error_setg(errp, "RDMA ERROR: " 899 "Linux kernel's RoCE / iWARP does not support IPv6 " 900 "(but patches on linux-rdma in progress)"); 901 return -1; 902 } 903 904 #endif 905 906 return 0; 907 } 908 909 /* 910 * Figure out which RDMA device corresponds to the requested IP hostname 911 * Also create the initial connection manager identifiers for opening 912 * the connection. 913 */ 914 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) 915 { 916 Error *err = NULL; 917 int ret; 918 struct rdma_addrinfo *res; 919 char port_str[16]; 920 struct rdma_cm_event *cm_event; 921 char ip[40] = "unknown"; 922 923 if (rdma->host == NULL || !strcmp(rdma->host, "")) { 924 error_setg(errp, "RDMA ERROR: RDMA hostname has not been set"); 925 return -1; 926 } 927 928 /* create CM channel */ 929 rdma->channel = rdma_create_event_channel(); 930 if (!rdma->channel) { 931 error_setg(errp, "RDMA ERROR: could not create CM channel"); 932 return -1; 933 } 934 935 /* create CM id */ 936 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP); 937 if (ret < 0) { 938 error_setg(errp, "RDMA ERROR: could not create channel id"); 939 goto err_resolve_create_id; 940 } 941 942 snprintf(port_str, 16, "%d", rdma->port); 943 port_str[15] = '\0'; 944 945 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); 946 if (ret) { 947 error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s", 948 rdma->host); 949 goto err_resolve_get_addr; 950 } 951 952 /* Try all addresses, saving the first error in @err */ 953 for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) { 954 Error **local_errp = err ? 
NULL : &err; 955 956 inet_ntop(e->ai_family, 957 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); 958 trace_qemu_rdma_resolve_host_trying(rdma->host, ip); 959 960 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, 961 RDMA_RESOLVE_TIMEOUT_MS); 962 if (ret >= 0) { 963 if (e->ai_family == AF_INET6) { 964 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, 965 local_errp); 966 if (ret < 0) { 967 continue; 968 } 969 } 970 error_free(err); 971 goto route; 972 } 973 } 974 975 rdma_freeaddrinfo(res); 976 if (err) { 977 error_propagate(errp, err); 978 } else { 979 error_setg(errp, "RDMA ERROR: could not resolve address %s", 980 rdma->host); 981 } 982 goto err_resolve_get_addr; 983 984 route: 985 rdma_freeaddrinfo(res); 986 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id); 987 988 ret = rdma_get_cm_event(rdma->channel, &cm_event); 989 if (ret < 0) { 990 error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved"); 991 goto err_resolve_get_addr; 992 } 993 994 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { 995 error_setg(errp, 996 "RDMA ERROR: result not equal to event_addr_resolved %s", 997 rdma_event_str(cm_event->event)); 998 rdma_ack_cm_event(cm_event); 999 goto err_resolve_get_addr; 1000 } 1001 rdma_ack_cm_event(cm_event); 1002 1003 /* resolve route */ 1004 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS); 1005 if (ret < 0) { 1006 error_setg(errp, "RDMA ERROR: could not resolve rdma route"); 1007 goto err_resolve_get_addr; 1008 } 1009 1010 ret = rdma_get_cm_event(rdma->channel, &cm_event); 1011 if (ret < 0) { 1012 error_setg(errp, "RDMA ERROR: could not perform event_route_resolved"); 1013 goto err_resolve_get_addr; 1014 } 1015 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { 1016 error_setg(errp, "RDMA ERROR: " 1017 "result not equal to event_route_resolved: %s", 1018 rdma_event_str(cm_event->event)); 1019 rdma_ack_cm_event(cm_event); 1020 goto err_resolve_get_addr; 1021 } 1022 
rdma_ack_cm_event(cm_event); 1023 rdma->verbs = rdma->cm_id->verbs; 1024 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs); 1025 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id); 1026 return 0; 1027 1028 err_resolve_get_addr: 1029 rdma_destroy_id(rdma->cm_id); 1030 rdma->cm_id = NULL; 1031 err_resolve_create_id: 1032 rdma_destroy_event_channel(rdma->channel); 1033 rdma->channel = NULL; 1034 return -1; 1035 } 1036 1037 /* 1038 * Create protection domain and completion queues 1039 */ 1040 static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp) 1041 { 1042 /* allocate pd */ 1043 rdma->pd = ibv_alloc_pd(rdma->verbs); 1044 if (!rdma->pd) { 1045 error_setg(errp, "failed to allocate protection domain"); 1046 return -1; 1047 } 1048 1049 /* create receive completion channel */ 1050 rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs); 1051 if (!rdma->recv_comp_channel) { 1052 error_setg(errp, "failed to allocate receive completion channel"); 1053 goto err_alloc_pd_cq; 1054 } 1055 1056 /* 1057 * Completion queue can be filled by read work requests. 
1058 */ 1059 rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3), 1060 NULL, rdma->recv_comp_channel, 0); 1061 if (!rdma->recv_cq) { 1062 error_setg(errp, "failed to allocate receive completion queue"); 1063 goto err_alloc_pd_cq; 1064 } 1065 1066 /* create send completion channel */ 1067 rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs); 1068 if (!rdma->send_comp_channel) { 1069 error_setg(errp, "failed to allocate send completion channel"); 1070 goto err_alloc_pd_cq; 1071 } 1072 1073 rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3), 1074 NULL, rdma->send_comp_channel, 0); 1075 if (!rdma->send_cq) { 1076 error_setg(errp, "failed to allocate send completion queue"); 1077 goto err_alloc_pd_cq; 1078 } 1079 1080 return 0; 1081 1082 err_alloc_pd_cq: 1083 if (rdma->pd) { 1084 ibv_dealloc_pd(rdma->pd); 1085 } 1086 if (rdma->recv_comp_channel) { 1087 ibv_destroy_comp_channel(rdma->recv_comp_channel); 1088 } 1089 if (rdma->send_comp_channel) { 1090 ibv_destroy_comp_channel(rdma->send_comp_channel); 1091 } 1092 if (rdma->recv_cq) { 1093 ibv_destroy_cq(rdma->recv_cq); 1094 rdma->recv_cq = NULL; 1095 } 1096 rdma->pd = NULL; 1097 rdma->recv_comp_channel = NULL; 1098 rdma->send_comp_channel = NULL; 1099 return -1; 1100 1101 } 1102 1103 /* 1104 * Create queue pairs. 
1105 */ 1106 static int qemu_rdma_alloc_qp(RDMAContext *rdma) 1107 { 1108 struct ibv_qp_init_attr attr = { 0 }; 1109 1110 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX; 1111 attr.cap.max_recv_wr = 3; 1112 attr.cap.max_send_sge = 1; 1113 attr.cap.max_recv_sge = 1; 1114 attr.send_cq = rdma->send_cq; 1115 attr.recv_cq = rdma->recv_cq; 1116 attr.qp_type = IBV_QPT_RC; 1117 1118 if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) { 1119 return -1; 1120 } 1121 1122 rdma->qp = rdma->cm_id->qp; 1123 return 0; 1124 } 1125 1126 /* Check whether On-Demand Paging is supported by RDAM device */ 1127 static bool rdma_support_odp(struct ibv_context *dev) 1128 { 1129 struct ibv_device_attr_ex attr = {0}; 1130 1131 if (ibv_query_device_ex(dev, NULL, &attr)) { 1132 return false; 1133 } 1134 1135 if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) { 1136 return true; 1137 } 1138 1139 return false; 1140 } 1141 1142 /* 1143 * ibv_advise_mr to avoid RNR NAK error as far as possible. 1144 * The responder mr registering with ODP will sent RNR NAK back to 1145 * the requester in the face of the page fault. 1146 */ 1147 static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr, 1148 uint32_t len, uint32_t lkey, 1149 const char *name, bool wr) 1150 { 1151 #ifdef HAVE_IBV_ADVISE_MR 1152 int ret; 1153 int advice = wr ? 
IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE : 1154 IBV_ADVISE_MR_ADVICE_PREFETCH; 1155 struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len}; 1156 1157 ret = ibv_advise_mr(pd, advice, 1158 IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1); 1159 /* ignore the error */ 1160 trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret)); 1161 #endif 1162 } 1163 1164 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp) 1165 { 1166 int i; 1167 RDMALocalBlocks *local = &rdma->local_ram_blocks; 1168 1169 for (i = 0; i < local->nb_blocks; i++) { 1170 int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE; 1171 1172 local->block[i].mr = 1173 ibv_reg_mr(rdma->pd, 1174 local->block[i].local_host_addr, 1175 local->block[i].length, access 1176 ); 1177 /* 1178 * ibv_reg_mr() is not documented to set errno. If it does, 1179 * it's somebody else's doc bug. If it doesn't, the use of 1180 * errno below is wrong. 1181 * TODO Find out whether ibv_reg_mr() sets errno. 1182 */ 1183 if (!local->block[i].mr && 1184 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) { 1185 access |= IBV_ACCESS_ON_DEMAND; 1186 /* register ODP mr */ 1187 local->block[i].mr = 1188 ibv_reg_mr(rdma->pd, 1189 local->block[i].local_host_addr, 1190 local->block[i].length, access); 1191 trace_qemu_rdma_register_odp_mr(local->block[i].block_name); 1192 1193 if (local->block[i].mr) { 1194 qemu_rdma_advise_prefetch_mr(rdma->pd, 1195 (uintptr_t)local->block[i].local_host_addr, 1196 local->block[i].length, 1197 local->block[i].mr->lkey, 1198 local->block[i].block_name, 1199 true); 1200 } 1201 } 1202 1203 if (!local->block[i].mr) { 1204 error_setg_errno(errp, errno, 1205 "Failed to register local dest ram block!"); 1206 goto err; 1207 } 1208 rdma->total_registrations++; 1209 } 1210 1211 return 0; 1212 1213 err: 1214 for (i--; i >= 0; i--) { 1215 ibv_dereg_mr(local->block[i].mr); 1216 local->block[i].mr = NULL; 1217 rdma->total_registrations--; 1218 } 1219 1220 return -1; 1221 1222 } 1223 1224 /* 1225 
* Find the ram block that corresponds to the page requested to be 1226 * transmitted by QEMU. 1227 * 1228 * Once the block is found, also identify which 'chunk' within that 1229 * block that the page belongs to. 1230 */ 1231 static void qemu_rdma_search_ram_block(RDMAContext *rdma, 1232 uintptr_t block_offset, 1233 uint64_t offset, 1234 uint64_t length, 1235 uint64_t *block_index, 1236 uint64_t *chunk_index) 1237 { 1238 uint64_t current_addr = block_offset + offset; 1239 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, 1240 (void *) block_offset); 1241 assert(block); 1242 assert(current_addr >= block->offset); 1243 assert((current_addr + length) <= (block->offset + block->length)); 1244 1245 *block_index = block->index; 1246 *chunk_index = ram_chunk_index(block->local_host_addr, 1247 block->local_host_addr + (current_addr - block->offset)); 1248 } 1249 1250 /* 1251 * Register a chunk with IB. If the chunk was already registered 1252 * previously, then skip. 1253 * 1254 * Also return the keys associated with the registration needed 1255 * to perform the actual RDMA operation. 1256 */ 1257 static int qemu_rdma_register_and_get_keys(RDMAContext *rdma, 1258 RDMALocalBlock *block, uintptr_t host_addr, 1259 uint32_t *lkey, uint32_t *rkey, int chunk, 1260 uint8_t *chunk_start, uint8_t *chunk_end) 1261 { 1262 if (block->mr) { 1263 if (lkey) { 1264 *lkey = block->mr->lkey; 1265 } 1266 if (rkey) { 1267 *rkey = block->mr->rkey; 1268 } 1269 return 0; 1270 } 1271 1272 /* allocate memory to store chunk MRs */ 1273 if (!block->pmr) { 1274 block->pmr = g_new0(struct ibv_mr *, block->nb_chunks); 1275 } 1276 1277 /* 1278 * If 'rkey', then we're the destination, so grant access to the source. 1279 * 1280 * If 'lkey', then we're the source VM, so grant access only to ourselves. 1281 */ 1282 if (!block->pmr[chunk]) { 1283 uint64_t len = chunk_end - chunk_start; 1284 int access = rkey ? 
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 1285 0; 1286 1287 trace_qemu_rdma_register_and_get_keys(len, chunk_start); 1288 1289 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access); 1290 /* 1291 * ibv_reg_mr() is not documented to set errno. If it does, 1292 * it's somebody else's doc bug. If it doesn't, the use of 1293 * errno below is wrong. 1294 * TODO Find out whether ibv_reg_mr() sets errno. 1295 */ 1296 if (!block->pmr[chunk] && 1297 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) { 1298 access |= IBV_ACCESS_ON_DEMAND; 1299 /* register ODP mr */ 1300 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access); 1301 trace_qemu_rdma_register_odp_mr(block->block_name); 1302 1303 if (block->pmr[chunk]) { 1304 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start, 1305 len, block->pmr[chunk]->lkey, 1306 block->block_name, rkey); 1307 1308 } 1309 } 1310 } 1311 if (!block->pmr[chunk]) { 1312 return -1; 1313 } 1314 rdma->total_registrations++; 1315 1316 if (lkey) { 1317 *lkey = block->pmr[chunk]->lkey; 1318 } 1319 if (rkey) { 1320 *rkey = block->pmr[chunk]->rkey; 1321 } 1322 return 0; 1323 } 1324 1325 /* 1326 * Register (at connection time) the memory used for control 1327 * channel messages. 1328 */ 1329 static int qemu_rdma_reg_control(RDMAContext *rdma, int idx) 1330 { 1331 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd, 1332 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER, 1333 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); 1334 if (rdma->wr_data[idx].control_mr) { 1335 rdma->total_registrations++; 1336 return 0; 1337 } 1338 return -1; 1339 } 1340 1341 /* 1342 * Perform a non-optimized memory unregistration after every transfer 1343 * for demonstration purposes, only if pin-all is not requested. 1344 * 1345 * Potential optimizations: 1346 * 1. Start a new thread to run this function continuously 1347 - for bit clearing 1348 - and for receipt of unregister messages 1349 * 2. Use an LRU. 1350 * 3. 
Use workload hints. 1351 */ 1352 static int qemu_rdma_unregister_waiting(RDMAContext *rdma) 1353 { 1354 Error *err = NULL; 1355 1356 while (rdma->unregistrations[rdma->unregister_current]) { 1357 int ret; 1358 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current]; 1359 uint64_t chunk = 1360 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; 1361 uint64_t index = 1362 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; 1363 RDMALocalBlock *block = 1364 &(rdma->local_ram_blocks.block[index]); 1365 RDMARegister reg = { .current_index = index }; 1366 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED, 1367 }; 1368 RDMAControlHeader head = { .len = sizeof(RDMARegister), 1369 .type = RDMA_CONTROL_UNREGISTER_REQUEST, 1370 .repeat = 1, 1371 }; 1372 1373 trace_qemu_rdma_unregister_waiting_proc(chunk, 1374 rdma->unregister_current); 1375 1376 rdma->unregistrations[rdma->unregister_current] = 0; 1377 rdma->unregister_current++; 1378 1379 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) { 1380 rdma->unregister_current = 0; 1381 } 1382 1383 1384 /* 1385 * Unregistration is speculative (because migration is single-threaded 1386 * and we cannot break the protocol's inifinband message ordering). 1387 * Thus, if the memory is currently being used for transmission, 1388 * then abort the attempt to unregister and try again 1389 * later the next time a completion is received for this memory. 
1390 */ 1391 clear_bit(chunk, block->unregister_bitmap); 1392 1393 if (test_bit(chunk, block->transit_bitmap)) { 1394 trace_qemu_rdma_unregister_waiting_inflight(chunk); 1395 continue; 1396 } 1397 1398 trace_qemu_rdma_unregister_waiting_send(chunk); 1399 1400 ret = ibv_dereg_mr(block->pmr[chunk]); 1401 block->pmr[chunk] = NULL; 1402 block->remote_keys[chunk] = 0; 1403 1404 if (ret != 0) { 1405 error_report("unregistration chunk failed: %s", 1406 strerror(ret)); 1407 return -1; 1408 } 1409 rdma->total_registrations--; 1410 1411 reg.key.chunk = chunk; 1412 register_to_network(rdma, ®); 1413 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, 1414 &resp, NULL, NULL, &err); 1415 if (ret < 0) { 1416 error_report_err(err); 1417 return -1; 1418 } 1419 1420 trace_qemu_rdma_unregister_waiting_complete(chunk); 1421 } 1422 1423 return 0; 1424 } 1425 1426 static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index, 1427 uint64_t chunk) 1428 { 1429 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK; 1430 1431 result |= (index << RDMA_WRID_BLOCK_SHIFT); 1432 result |= (chunk << RDMA_WRID_CHUNK_SHIFT); 1433 1434 return result; 1435 } 1436 1437 /* 1438 * Consult the connection manager to see a work request 1439 * (of any kind) has completed. 1440 * Return the work request ID that completed. 
1441 */ 1442 static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq, 1443 uint64_t *wr_id_out, uint32_t *byte_len) 1444 { 1445 int ret; 1446 struct ibv_wc wc; 1447 uint64_t wr_id; 1448 1449 ret = ibv_poll_cq(cq, 1, &wc); 1450 1451 if (!ret) { 1452 *wr_id_out = RDMA_WRID_NONE; 1453 return 0; 1454 } 1455 1456 if (ret < 0) { 1457 return -1; 1458 } 1459 1460 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK; 1461 1462 if (wc.status != IBV_WC_SUCCESS) { 1463 return -1; 1464 } 1465 1466 if (rdma->control_ready_expected && 1467 (wr_id >= RDMA_WRID_RECV_CONTROL)) { 1468 trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id, 1469 rdma->nb_sent); 1470 rdma->control_ready_expected = 0; 1471 } 1472 1473 if (wr_id == RDMA_WRID_RDMA_WRITE) { 1474 uint64_t chunk = 1475 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; 1476 uint64_t index = 1477 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; 1478 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]); 1479 1480 trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent, 1481 index, chunk, block->local_host_addr, 1482 (void *)(uintptr_t)block->remote_host_addr); 1483 1484 clear_bit(chunk, block->transit_bitmap); 1485 1486 if (rdma->nb_sent > 0) { 1487 rdma->nb_sent--; 1488 } 1489 } else { 1490 trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent); 1491 } 1492 1493 *wr_id_out = wc.wr_id; 1494 if (byte_len) { 1495 *byte_len = wc.byte_len; 1496 } 1497 1498 return 0; 1499 } 1500 1501 /* Wait for activity on the completion channel. 1502 * Returns 0 on success, none-0 on error. 1503 */ 1504 static int qemu_rdma_wait_comp_channel(RDMAContext *rdma, 1505 struct ibv_comp_channel *comp_channel) 1506 { 1507 struct rdma_cm_event *cm_event; 1508 1509 /* 1510 * Coroutine doesn't start until migration_fd_process_incoming() 1511 * so don't yield unless we know we're running inside of a coroutine. 
1512 */ 1513 if (rdma->migration_started_on_destination && 1514 migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) { 1515 yield_until_fd_readable(comp_channel->fd); 1516 } else { 1517 /* This is the source side, we're in a separate thread 1518 * or destination prior to migration_fd_process_incoming() 1519 * after postcopy, the destination also in a separate thread. 1520 * we can't yield; so we have to poll the fd. 1521 * But we need to be able to handle 'cancel' or an error 1522 * without hanging forever. 1523 */ 1524 while (!rdma->errored && !rdma->received_error) { 1525 GPollFD pfds[2]; 1526 pfds[0].fd = comp_channel->fd; 1527 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR; 1528 pfds[0].revents = 0; 1529 1530 pfds[1].fd = rdma->channel->fd; 1531 pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR; 1532 pfds[1].revents = 0; 1533 1534 /* 0.1s timeout, should be fine for a 'cancel' */ 1535 switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) { 1536 case 2: 1537 case 1: /* fd active */ 1538 if (pfds[0].revents) { 1539 return 0; 1540 } 1541 1542 if (pfds[1].revents) { 1543 if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) { 1544 return -1; 1545 } 1546 1547 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED || 1548 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { 1549 rdma_ack_cm_event(cm_event); 1550 return -1; 1551 } 1552 rdma_ack_cm_event(cm_event); 1553 } 1554 break; 1555 1556 case 0: /* Timeout, go around again */ 1557 break; 1558 1559 default: /* Error of some type - 1560 * I don't trust errno from qemu_poll_ns 1561 */ 1562 return -1; 1563 } 1564 1565 if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) { 1566 /* Bail out and let the cancellation happen */ 1567 return -1; 1568 } 1569 } 1570 } 1571 1572 if (rdma->received_error) { 1573 return -1; 1574 } 1575 return -rdma->errored; 1576 } 1577 1578 static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid) 1579 { 1580 return wrid < RDMA_WRID_RECV_CONTROL ? 
rdma->send_comp_channel : 1581 rdma->recv_comp_channel; 1582 } 1583 1584 static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid) 1585 { 1586 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq; 1587 } 1588 1589 /* 1590 * Block until the next work request has completed. 1591 * 1592 * First poll to see if a work request has already completed, 1593 * otherwise block. 1594 * 1595 * If we encounter completed work requests for IDs other than 1596 * the one we're interested in, then that's generally an error. 1597 * 1598 * The only exception is actual RDMA Write completions. These 1599 * completions only need to be recorded, but do not actually 1600 * need further processing. 1601 */ 1602 static int qemu_rdma_block_for_wrid(RDMAContext *rdma, 1603 uint64_t wrid_requested, 1604 uint32_t *byte_len) 1605 { 1606 int num_cq_events = 0, ret; 1607 struct ibv_cq *cq; 1608 void *cq_ctx; 1609 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in; 1610 struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested); 1611 struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested); 1612 1613 if (ibv_req_notify_cq(poll_cq, 0)) { 1614 return -1; 1615 } 1616 /* poll cq first */ 1617 while (wr_id != wrid_requested) { 1618 ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len); 1619 if (ret < 0) { 1620 return -1; 1621 } 1622 1623 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; 1624 1625 if (wr_id == RDMA_WRID_NONE) { 1626 break; 1627 } 1628 if (wr_id != wrid_requested) { 1629 trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id); 1630 } 1631 } 1632 1633 if (wr_id == wrid_requested) { 1634 return 0; 1635 } 1636 1637 while (1) { 1638 ret = qemu_rdma_wait_comp_channel(rdma, ch); 1639 if (ret < 0) { 1640 goto err_block_for_wrid; 1641 } 1642 1643 ret = ibv_get_cq_event(ch, &cq, &cq_ctx); 1644 if (ret < 0) { 1645 goto err_block_for_wrid; 1646 } 1647 1648 num_cq_events++; 1649 1650 if (ibv_req_notify_cq(cq, 0)) { 1651 goto err_block_for_wrid; 1652 } 1653 1654 while (wr_id != wrid_requested) { 
1655 ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len); 1656 if (ret < 0) { 1657 goto err_block_for_wrid; 1658 } 1659 1660 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; 1661 1662 if (wr_id == RDMA_WRID_NONE) { 1663 break; 1664 } 1665 if (wr_id != wrid_requested) { 1666 trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id); 1667 } 1668 } 1669 1670 if (wr_id == wrid_requested) { 1671 goto success_block_for_wrid; 1672 } 1673 } 1674 1675 success_block_for_wrid: 1676 if (num_cq_events) { 1677 ibv_ack_cq_events(cq, num_cq_events); 1678 } 1679 return 0; 1680 1681 err_block_for_wrid: 1682 if (num_cq_events) { 1683 ibv_ack_cq_events(cq, num_cq_events); 1684 } 1685 1686 rdma->errored = true; 1687 return -1; 1688 } 1689 1690 /* 1691 * Post a SEND message work request for the control channel 1692 * containing some data and block until the post completes. 1693 */ 1694 static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf, 1695 RDMAControlHeader *head, 1696 Error **errp) 1697 { 1698 int ret; 1699 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL]; 1700 struct ibv_send_wr *bad_wr; 1701 struct ibv_sge sge = { 1702 .addr = (uintptr_t)(wr->control), 1703 .length = head->len + sizeof(RDMAControlHeader), 1704 .lkey = wr->control_mr->lkey, 1705 }; 1706 struct ibv_send_wr send_wr = { 1707 .wr_id = RDMA_WRID_SEND_CONTROL, 1708 .opcode = IBV_WR_SEND, 1709 .send_flags = IBV_SEND_SIGNALED, 1710 .sg_list = &sge, 1711 .num_sge = 1, 1712 }; 1713 1714 trace_qemu_rdma_post_send_control(control_desc(head->type)); 1715 1716 /* 1717 * We don't actually need to do a memcpy() in here if we used 1718 * the "sge" properly, but since we're only sending control messages 1719 * (not RAM in a performance-critical path), then its OK for now. 1720 * 1721 * The copy makes the RDMAControlHeader simpler to manipulate 1722 * for the time being. 
1723 */ 1724 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head)); 1725 memcpy(wr->control, head, sizeof(RDMAControlHeader)); 1726 control_to_network((void *) wr->control); 1727 1728 if (buf) { 1729 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len); 1730 } 1731 1732 1733 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); 1734 1735 if (ret > 0) { 1736 error_setg(errp, "Failed to use post IB SEND for control"); 1737 return -1; 1738 } 1739 1740 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL); 1741 if (ret < 0) { 1742 error_setg(errp, "rdma migration: send polling control error"); 1743 return -1; 1744 } 1745 1746 return 0; 1747 } 1748 1749 /* 1750 * Post a RECV work request in anticipation of some future receipt 1751 * of data on the control channel. 1752 */ 1753 static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx, 1754 Error **errp) 1755 { 1756 struct ibv_recv_wr *bad_wr; 1757 struct ibv_sge sge = { 1758 .addr = (uintptr_t)(rdma->wr_data[idx].control), 1759 .length = RDMA_CONTROL_MAX_BUFFER, 1760 .lkey = rdma->wr_data[idx].control_mr->lkey, 1761 }; 1762 1763 struct ibv_recv_wr recv_wr = { 1764 .wr_id = RDMA_WRID_RECV_CONTROL + idx, 1765 .sg_list = &sge, 1766 .num_sge = 1, 1767 }; 1768 1769 1770 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) { 1771 error_setg(errp, "error posting control recv"); 1772 return -1; 1773 } 1774 1775 return 0; 1776 } 1777 1778 /* 1779 * Block and wait for a RECV control channel message to arrive. 
1780 */ 1781 static int qemu_rdma_exchange_get_response(RDMAContext *rdma, 1782 RDMAControlHeader *head, uint32_t expecting, int idx, 1783 Error **errp) 1784 { 1785 uint32_t byte_len; 1786 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx, 1787 &byte_len); 1788 1789 if (ret < 0) { 1790 error_setg(errp, "rdma migration: recv polling control error!"); 1791 return -1; 1792 } 1793 1794 network_to_control((void *) rdma->wr_data[idx].control); 1795 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader)); 1796 1797 trace_qemu_rdma_exchange_get_response_start(control_desc(expecting)); 1798 1799 if (expecting == RDMA_CONTROL_NONE) { 1800 trace_qemu_rdma_exchange_get_response_none(control_desc(head->type), 1801 head->type); 1802 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) { 1803 error_setg(errp, "Was expecting a %s (%d) control message" 1804 ", but got: %s (%d), length: %d", 1805 control_desc(expecting), expecting, 1806 control_desc(head->type), head->type, head->len); 1807 if (head->type == RDMA_CONTROL_ERROR) { 1808 rdma->received_error = true; 1809 } 1810 return -1; 1811 } 1812 if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) { 1813 error_setg(errp, "too long length: %d", head->len); 1814 return -1; 1815 } 1816 if (sizeof(*head) + head->len != byte_len) { 1817 error_setg(errp, "Malformed length: %d byte_len %d", 1818 head->len, byte_len); 1819 return -1; 1820 } 1821 1822 return 0; 1823 } 1824 1825 /* 1826 * When a RECV work request has completed, the work request's 1827 * buffer is pointed at the header. 1828 * 1829 * This will advance the pointer to the data portion 1830 * of the control message of the work request's buffer that 1831 * was populated after the work request finished. 
1832 */ 1833 static void qemu_rdma_move_header(RDMAContext *rdma, int idx, 1834 RDMAControlHeader *head) 1835 { 1836 rdma->wr_data[idx].control_len = head->len; 1837 rdma->wr_data[idx].control_curr = 1838 rdma->wr_data[idx].control + sizeof(RDMAControlHeader); 1839 } 1840 1841 /* 1842 * This is an 'atomic' high-level operation to deliver a single, unified 1843 * control-channel message. 1844 * 1845 * Additionally, if the user is expecting some kind of reply to this message, 1846 * they can request a 'resp' response message be filled in by posting an 1847 * additional work request on behalf of the user and waiting for an additional 1848 * completion. 1849 * 1850 * The extra (optional) response is used during registration to us from having 1851 * to perform an *additional* exchange of message just to provide a response by 1852 * instead piggy-backing on the acknowledgement. 1853 */ 1854 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head, 1855 uint8_t *data, RDMAControlHeader *resp, 1856 int *resp_idx, 1857 int (*callback)(RDMAContext *rdma, 1858 Error **errp), 1859 Error **errp) 1860 { 1861 int ret; 1862 1863 /* 1864 * Wait until the dest is ready before attempting to deliver the message 1865 * by waiting for a READY message. 1866 */ 1867 if (rdma->control_ready_expected) { 1868 RDMAControlHeader resp_ignored; 1869 1870 ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored, 1871 RDMA_CONTROL_READY, 1872 RDMA_WRID_READY, errp); 1873 if (ret < 0) { 1874 return -1; 1875 } 1876 } 1877 1878 /* 1879 * If the user is expecting a response, post a WR in anticipation of it. 1880 */ 1881 if (resp) { 1882 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp); 1883 if (ret < 0) { 1884 return -1; 1885 } 1886 } 1887 1888 /* 1889 * Post a WR to replace the one we just consumed for the READY message. 
1890 */ 1891 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp); 1892 if (ret < 0) { 1893 return -1; 1894 } 1895 1896 /* 1897 * Deliver the control message that was requested. 1898 */ 1899 ret = qemu_rdma_post_send_control(rdma, data, head, errp); 1900 1901 if (ret < 0) { 1902 return -1; 1903 } 1904 1905 /* 1906 * If we're expecting a response, block and wait for it. 1907 */ 1908 if (resp) { 1909 if (callback) { 1910 trace_qemu_rdma_exchange_send_issue_callback(); 1911 ret = callback(rdma, errp); 1912 if (ret < 0) { 1913 return -1; 1914 } 1915 } 1916 1917 trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type)); 1918 ret = qemu_rdma_exchange_get_response(rdma, resp, 1919 resp->type, RDMA_WRID_DATA, 1920 errp); 1921 1922 if (ret < 0) { 1923 return -1; 1924 } 1925 1926 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp); 1927 if (resp_idx) { 1928 *resp_idx = RDMA_WRID_DATA; 1929 } 1930 trace_qemu_rdma_exchange_send_received(control_desc(resp->type)); 1931 } 1932 1933 rdma->control_ready_expected = 1; 1934 1935 return 0; 1936 } 1937 1938 /* 1939 * This is an 'atomic' high-level operation to receive a single, unified 1940 * control-channel message. 1941 */ 1942 static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head, 1943 uint32_t expecting, Error **errp) 1944 { 1945 RDMAControlHeader ready = { 1946 .len = 0, 1947 .type = RDMA_CONTROL_READY, 1948 .repeat = 1, 1949 }; 1950 int ret; 1951 1952 /* 1953 * Inform the source that we're ready to receive a message. 1954 */ 1955 ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp); 1956 1957 if (ret < 0) { 1958 return -1; 1959 } 1960 1961 /* 1962 * Block and wait for the message. 1963 */ 1964 ret = qemu_rdma_exchange_get_response(rdma, head, 1965 expecting, RDMA_WRID_READY, errp); 1966 1967 if (ret < 0) { 1968 return -1; 1969 } 1970 1971 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head); 1972 1973 /* 1974 * Post a new RECV work request to replace the one we just consumed. 
1975 */ 1976 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp); 1977 if (ret < 0) { 1978 return -1; 1979 } 1980 1981 return 0; 1982 } 1983 1984 /* 1985 * Write an actual chunk of memory using RDMA. 1986 * 1987 * If we're using dynamic registration on the dest-side, we have to 1988 * send a registration command first. 1989 */ 1990 static int qemu_rdma_write_one(RDMAContext *rdma, 1991 int current_index, uint64_t current_addr, 1992 uint64_t length, Error **errp) 1993 { 1994 struct ibv_sge sge; 1995 struct ibv_send_wr send_wr = { 0 }; 1996 struct ibv_send_wr *bad_wr; 1997 int reg_result_idx, ret, count = 0; 1998 uint64_t chunk, chunks; 1999 uint8_t *chunk_start, *chunk_end; 2000 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]); 2001 RDMARegister reg; 2002 RDMARegisterResult *reg_result; 2003 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT }; 2004 RDMAControlHeader head = { .len = sizeof(RDMARegister), 2005 .type = RDMA_CONTROL_REGISTER_REQUEST, 2006 .repeat = 1, 2007 }; 2008 2009 retry: 2010 sge.addr = (uintptr_t)(block->local_host_addr + 2011 (current_addr - block->offset)); 2012 sge.length = length; 2013 2014 chunk = ram_chunk_index(block->local_host_addr, 2015 (uint8_t *)(uintptr_t)sge.addr); 2016 chunk_start = ram_chunk_start(block, chunk); 2017 2018 if (block->is_ram_block) { 2019 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT); 2020 2021 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) { 2022 chunks--; 2023 } 2024 } else { 2025 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT); 2026 2027 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) { 2028 chunks--; 2029 } 2030 } 2031 2032 trace_qemu_rdma_write_one_top(chunks + 1, 2033 (chunks + 1) * 2034 (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024); 2035 2036 chunk_end = ram_chunk_end(block, chunk + chunks); 2037 2038 2039 while (test_bit(chunk, block->transit_bitmap)) { 2040 (void)count; 2041 trace_qemu_rdma_write_one_block(count++, 
current_index, chunk, 2042 sge.addr, length, rdma->nb_sent, block->nb_chunks); 2043 2044 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); 2045 2046 if (ret < 0) { 2047 error_setg(errp, "Failed to Wait for previous write to complete " 2048 "block %d chunk %" PRIu64 2049 " current %" PRIu64 " len %" PRIu64 " %d", 2050 current_index, chunk, sge.addr, length, rdma->nb_sent); 2051 return -1; 2052 } 2053 } 2054 2055 if (!rdma->pin_all || !block->is_ram_block) { 2056 if (!block->remote_keys[chunk]) { 2057 /* 2058 * This chunk has not yet been registered, so first check to see 2059 * if the entire chunk is zero. If so, tell the other size to 2060 * memset() + madvise() the entire chunk without RDMA. 2061 */ 2062 2063 if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) { 2064 RDMACompress comp = { 2065 .offset = current_addr, 2066 .value = 0, 2067 .block_idx = current_index, 2068 .length = length, 2069 }; 2070 2071 head.len = sizeof(comp); 2072 head.type = RDMA_CONTROL_COMPRESS; 2073 2074 trace_qemu_rdma_write_one_zero(chunk, sge.length, 2075 current_index, current_addr); 2076 2077 compress_to_network(rdma, &comp); 2078 ret = qemu_rdma_exchange_send(rdma, &head, 2079 (uint8_t *) &comp, NULL, NULL, NULL, errp); 2080 2081 if (ret < 0) { 2082 return -1; 2083 } 2084 2085 /* 2086 * TODO: Here we are sending something, but we are not 2087 * accounting for anything transferred. The following is wrong: 2088 * 2089 * stat64_add(&mig_stats.rdma_bytes, sge.length); 2090 * 2091 * because we are using some kind of compression. I 2092 * would think that head.len would be the more similar 2093 * thing to a correct value. 2094 */ 2095 stat64_add(&mig_stats.zero_pages, 2096 sge.length / qemu_target_page_size()); 2097 return 1; 2098 } 2099 2100 /* 2101 * Otherwise, tell other side to register. 
2102 */ 2103 reg.current_index = current_index; 2104 if (block->is_ram_block) { 2105 reg.key.current_addr = current_addr; 2106 } else { 2107 reg.key.chunk = chunk; 2108 } 2109 reg.chunks = chunks; 2110 2111 trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index, 2112 current_addr); 2113 2114 register_to_network(rdma, ®); 2115 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, 2116 &resp, ®_result_idx, NULL, errp); 2117 if (ret < 0) { 2118 return -1; 2119 } 2120 2121 /* try to overlap this single registration with the one we sent. */ 2122 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, 2123 &sge.lkey, NULL, chunk, 2124 chunk_start, chunk_end)) { 2125 error_setg(errp, "cannot get lkey"); 2126 return -1; 2127 } 2128 2129 reg_result = (RDMARegisterResult *) 2130 rdma->wr_data[reg_result_idx].control_curr; 2131 2132 network_to_result(reg_result); 2133 2134 trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk], 2135 reg_result->rkey, chunk); 2136 2137 block->remote_keys[chunk] = reg_result->rkey; 2138 block->remote_host_addr = reg_result->host_addr; 2139 } else { 2140 /* already registered before */ 2141 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, 2142 &sge.lkey, NULL, chunk, 2143 chunk_start, chunk_end)) { 2144 error_setg(errp, "cannot get lkey!"); 2145 return -1; 2146 } 2147 } 2148 2149 send_wr.wr.rdma.rkey = block->remote_keys[chunk]; 2150 } else { 2151 send_wr.wr.rdma.rkey = block->remote_rkey; 2152 2153 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, 2154 &sge.lkey, NULL, chunk, 2155 chunk_start, chunk_end)) { 2156 error_setg(errp, "cannot get lkey!"); 2157 return -1; 2158 } 2159 } 2160 2161 /* 2162 * Encode the ram block index and chunk within this wrid. 2163 * We will use this information at the time of completion 2164 * to figure out which bitmap to check against and then which 2165 * chunk in the bitmap to look for. 
2166 */ 2167 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, 2168 current_index, chunk); 2169 2170 send_wr.opcode = IBV_WR_RDMA_WRITE; 2171 send_wr.send_flags = IBV_SEND_SIGNALED; 2172 send_wr.sg_list = &sge; 2173 send_wr.num_sge = 1; 2174 send_wr.wr.rdma.remote_addr = block->remote_host_addr + 2175 (current_addr - block->offset); 2176 2177 trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr, 2178 sge.length); 2179 2180 /* 2181 * ibv_post_send() does not return negative error numbers, 2182 * per the specification they are positive - no idea why. 2183 */ 2184 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); 2185 2186 if (ret == ENOMEM) { 2187 trace_qemu_rdma_write_one_queue_full(); 2188 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); 2189 if (ret < 0) { 2190 error_setg(errp, "rdma migration: failed to make " 2191 "room in full send queue!"); 2192 return -1; 2193 } 2194 2195 goto retry; 2196 2197 } else if (ret > 0) { 2198 error_setg_errno(errp, ret, 2199 "rdma migration: post rdma write failed"); 2200 return -1; 2201 } 2202 2203 set_bit(chunk, block->transit_bitmap); 2204 stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size()); 2205 /* 2206 * We are adding to transferred the amount of data written, but no 2207 * overhead at all. I will asume that RDMA is magicaly and don't 2208 * need to transfer (at least) the addresses where it wants to 2209 * write the pages. Here it looks like it should be something 2210 * like: 2211 * sizeof(send_wr) + sge.length 2212 * but this being RDMA, who knows. 2213 */ 2214 stat64_add(&mig_stats.rdma_bytes, sge.length); 2215 ram_transferred_add(sge.length); 2216 rdma->total_writes++; 2217 2218 return 0; 2219 } 2220 2221 /* 2222 * Push out any unwritten RDMA operations. 2223 * 2224 * We support sending out multiple chunks at the same time. 2225 * Not all of them need to get signaled in the completion queue. 
2226 */ 2227 static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp) 2228 { 2229 int ret; 2230 2231 if (!rdma->current_length) { 2232 return 0; 2233 } 2234 2235 ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr, 2236 rdma->current_length, errp); 2237 2238 if (ret < 0) { 2239 return -1; 2240 } 2241 2242 if (ret == 0) { 2243 rdma->nb_sent++; 2244 trace_qemu_rdma_write_flush(rdma->nb_sent); 2245 } 2246 2247 rdma->current_length = 0; 2248 rdma->current_addr = 0; 2249 2250 return 0; 2251 } 2252 2253 static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma, 2254 uint64_t offset, uint64_t len) 2255 { 2256 RDMALocalBlock *block; 2257 uint8_t *host_addr; 2258 uint8_t *chunk_end; 2259 2260 if (rdma->current_index < 0) { 2261 return false; 2262 } 2263 2264 if (rdma->current_chunk < 0) { 2265 return false; 2266 } 2267 2268 block = &(rdma->local_ram_blocks.block[rdma->current_index]); 2269 host_addr = block->local_host_addr + (offset - block->offset); 2270 chunk_end = ram_chunk_end(block, rdma->current_chunk); 2271 2272 if (rdma->current_length == 0) { 2273 return false; 2274 } 2275 2276 /* 2277 * Only merge into chunk sequentially. 2278 */ 2279 if (offset != (rdma->current_addr + rdma->current_length)) { 2280 return false; 2281 } 2282 2283 if (offset < block->offset) { 2284 return false; 2285 } 2286 2287 if ((offset + len) > (block->offset + block->length)) { 2288 return false; 2289 } 2290 2291 if ((host_addr + len) > chunk_end) { 2292 return false; 2293 } 2294 2295 return true; 2296 } 2297 2298 /* 2299 * We're not actually writing here, but doing three things: 2300 * 2301 * 1. Identify the chunk the buffer belongs to. 2302 * 2. If the chunk is full or the buffer doesn't belong to the current 2303 * chunk, then start a new chunk and flush() the old chunk. 2304 * 3. 
To keep the hardware busy, we also group chunks into batches 2305 * and only require that a batch gets acknowledged in the completion 2306 * queue instead of each individual chunk. 2307 */ 2308 static int qemu_rdma_write(RDMAContext *rdma, 2309 uint64_t block_offset, uint64_t offset, 2310 uint64_t len, Error **errp) 2311 { 2312 uint64_t current_addr = block_offset + offset; 2313 uint64_t index = rdma->current_index; 2314 uint64_t chunk = rdma->current_chunk; 2315 2316 /* If we cannot merge it, we flush the current buffer first. */ 2317 if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) { 2318 if (qemu_rdma_write_flush(rdma, errp) < 0) { 2319 return -1; 2320 } 2321 rdma->current_length = 0; 2322 rdma->current_addr = current_addr; 2323 2324 qemu_rdma_search_ram_block(rdma, block_offset, 2325 offset, len, &index, &chunk); 2326 rdma->current_index = index; 2327 rdma->current_chunk = chunk; 2328 } 2329 2330 /* merge it */ 2331 rdma->current_length += len; 2332 2333 /* flush it if buffer is too large */ 2334 if (rdma->current_length >= RDMA_MERGE_MAX) { 2335 return qemu_rdma_write_flush(rdma, errp); 2336 } 2337 2338 return 0; 2339 } 2340 2341 static void qemu_rdma_cleanup(RDMAContext *rdma) 2342 { 2343 Error *err = NULL; 2344 2345 if (rdma->cm_id && rdma->connected) { 2346 if ((rdma->errored || 2347 migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) && 2348 !rdma->received_error) { 2349 RDMAControlHeader head = { .len = 0, 2350 .type = RDMA_CONTROL_ERROR, 2351 .repeat = 1, 2352 }; 2353 warn_report("Early error. 
Sending error."); 2354 if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) { 2355 warn_report_err(err); 2356 } 2357 } 2358 2359 rdma_disconnect(rdma->cm_id); 2360 trace_qemu_rdma_cleanup_disconnect(); 2361 rdma->connected = false; 2362 } 2363 2364 if (rdma->channel) { 2365 qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL); 2366 } 2367 g_free(rdma->dest_blocks); 2368 rdma->dest_blocks = NULL; 2369 2370 for (int i = 0; i < RDMA_WRID_MAX; i++) { 2371 if (rdma->wr_data[i].control_mr) { 2372 rdma->total_registrations--; 2373 ibv_dereg_mr(rdma->wr_data[i].control_mr); 2374 } 2375 rdma->wr_data[i].control_mr = NULL; 2376 } 2377 2378 if (rdma->local_ram_blocks.block) { 2379 while (rdma->local_ram_blocks.nb_blocks) { 2380 rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); 2381 } 2382 } 2383 2384 if (rdma->qp) { 2385 rdma_destroy_qp(rdma->cm_id); 2386 rdma->qp = NULL; 2387 } 2388 if (rdma->recv_cq) { 2389 ibv_destroy_cq(rdma->recv_cq); 2390 rdma->recv_cq = NULL; 2391 } 2392 if (rdma->send_cq) { 2393 ibv_destroy_cq(rdma->send_cq); 2394 rdma->send_cq = NULL; 2395 } 2396 if (rdma->recv_comp_channel) { 2397 ibv_destroy_comp_channel(rdma->recv_comp_channel); 2398 rdma->recv_comp_channel = NULL; 2399 } 2400 if (rdma->send_comp_channel) { 2401 ibv_destroy_comp_channel(rdma->send_comp_channel); 2402 rdma->send_comp_channel = NULL; 2403 } 2404 if (rdma->pd) { 2405 ibv_dealloc_pd(rdma->pd); 2406 rdma->pd = NULL; 2407 } 2408 if (rdma->cm_id) { 2409 rdma_destroy_id(rdma->cm_id); 2410 rdma->cm_id = NULL; 2411 } 2412 2413 /* the destination side, listen_id and channel is shared */ 2414 if (rdma->listen_id) { 2415 if (!rdma->is_return_path) { 2416 rdma_destroy_id(rdma->listen_id); 2417 } 2418 rdma->listen_id = NULL; 2419 2420 if (rdma->channel) { 2421 if (!rdma->is_return_path) { 2422 rdma_destroy_event_channel(rdma->channel); 2423 } 2424 rdma->channel = NULL; 2425 } 2426 } 2427 2428 if (rdma->channel) { 2429 rdma_destroy_event_channel(rdma->channel); 2430 
rdma->channel = NULL; 2431 } 2432 g_free(rdma->host); 2433 rdma->host = NULL; 2434 } 2435 2436 2437 static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp) 2438 { 2439 int ret; 2440 2441 /* 2442 * Will be validated against destination's actual capabilities 2443 * after the connect() completes. 2444 */ 2445 rdma->pin_all = pin_all; 2446 2447 ret = qemu_rdma_resolve_host(rdma, errp); 2448 if (ret < 0) { 2449 goto err_rdma_source_init; 2450 } 2451 2452 ret = qemu_rdma_alloc_pd_cq(rdma, errp); 2453 if (ret < 0) { 2454 goto err_rdma_source_init; 2455 } 2456 2457 ret = qemu_rdma_alloc_qp(rdma); 2458 if (ret < 0) { 2459 error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!"); 2460 goto err_rdma_source_init; 2461 } 2462 2463 qemu_rdma_init_ram_blocks(rdma); 2464 2465 /* Build the hash that maps from offset to RAMBlock */ 2466 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); 2467 for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) { 2468 g_hash_table_insert(rdma->blockmap, 2469 (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset, 2470 &rdma->local_ram_blocks.block[i]); 2471 } 2472 2473 for (int i = 0; i < RDMA_WRID_MAX; i++) { 2474 ret = qemu_rdma_reg_control(rdma, i); 2475 if (ret < 0) { 2476 error_setg(errp, "RDMA ERROR: rdma migration: error " 2477 "registering %d control!", i); 2478 goto err_rdma_source_init; 2479 } 2480 } 2481 2482 return 0; 2483 2484 err_rdma_source_init: 2485 qemu_rdma_cleanup(rdma); 2486 return -1; 2487 } 2488 2489 static int qemu_get_cm_event_timeout(RDMAContext *rdma, 2490 struct rdma_cm_event **cm_event, 2491 long msec, Error **errp) 2492 { 2493 int ret; 2494 struct pollfd poll_fd = { 2495 .fd = rdma->channel->fd, 2496 .events = POLLIN, 2497 .revents = 0 2498 }; 2499 2500 do { 2501 ret = poll(&poll_fd, 1, msec); 2502 } while (ret < 0 && errno == EINTR); 2503 2504 if (ret == 0) { 2505 error_setg(errp, "RDMA ERROR: poll cm event timeout"); 2506 return -1; 2507 } else if (ret < 0) { 
2508 error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i", 2509 errno); 2510 return -1; 2511 } else if (poll_fd.revents & POLLIN) { 2512 if (rdma_get_cm_event(rdma->channel, cm_event) < 0) { 2513 error_setg(errp, "RDMA ERROR: failed to get cm event"); 2514 return -1; 2515 } 2516 return 0; 2517 } else { 2518 error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x", 2519 poll_fd.revents); 2520 return -1; 2521 } 2522 } 2523 2524 static int qemu_rdma_connect(RDMAContext *rdma, bool return_path, 2525 Error **errp) 2526 { 2527 RDMACapabilities cap = { 2528 .version = RDMA_CONTROL_VERSION_CURRENT, 2529 .flags = 0, 2530 }; 2531 struct rdma_conn_param conn_param = { .initiator_depth = 2, 2532 .retry_count = 5, 2533 .private_data = &cap, 2534 .private_data_len = sizeof(cap), 2535 }; 2536 struct rdma_cm_event *cm_event; 2537 int ret; 2538 2539 /* 2540 * Only negotiate the capability with destination if the user 2541 * on the source first requested the capability. 2542 */ 2543 if (rdma->pin_all) { 2544 trace_qemu_rdma_connect_pin_all_requested(); 2545 cap.flags |= RDMA_CAPABILITY_PIN_ALL; 2546 } 2547 2548 caps_to_network(&cap); 2549 2550 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp); 2551 if (ret < 0) { 2552 goto err_rdma_source_connect; 2553 } 2554 2555 ret = rdma_connect(rdma->cm_id, &conn_param); 2556 if (ret < 0) { 2557 error_setg_errno(errp, errno, 2558 "RDMA ERROR: connecting to destination!"); 2559 goto err_rdma_source_connect; 2560 } 2561 2562 if (return_path) { 2563 ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp); 2564 } else { 2565 ret = rdma_get_cm_event(rdma->channel, &cm_event); 2566 if (ret < 0) { 2567 error_setg_errno(errp, errno, 2568 "RDMA ERROR: failed to get cm event"); 2569 } 2570 } 2571 if (ret < 0) { 2572 goto err_rdma_source_connect; 2573 } 2574 2575 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { 2576 error_setg(errp, "RDMA ERROR: connecting to destination!"); 2577 rdma_ack_cm_event(cm_event); 2578 goto 
err_rdma_source_connect; 2579 } 2580 rdma->connected = true; 2581 2582 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); 2583 network_to_caps(&cap); 2584 2585 /* 2586 * Verify that the *requested* capabilities are supported by the destination 2587 * and disable them otherwise. 2588 */ 2589 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) { 2590 warn_report("RDMA: Server cannot support pinning all memory. " 2591 "Will register memory dynamically."); 2592 rdma->pin_all = false; 2593 } 2594 2595 trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all); 2596 2597 rdma_ack_cm_event(cm_event); 2598 2599 rdma->control_ready_expected = 1; 2600 rdma->nb_sent = 0; 2601 return 0; 2602 2603 err_rdma_source_connect: 2604 qemu_rdma_cleanup(rdma); 2605 return -1; 2606 } 2607 2608 static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) 2609 { 2610 Error *err = NULL; 2611 int ret; 2612 struct rdma_cm_id *listen_id; 2613 char ip[40] = "unknown"; 2614 struct rdma_addrinfo *res, *e; 2615 char port_str[16]; 2616 int reuse = 1; 2617 2618 for (int i = 0; i < RDMA_WRID_MAX; i++) { 2619 rdma->wr_data[i].control_len = 0; 2620 rdma->wr_data[i].control_curr = NULL; 2621 } 2622 2623 if (!rdma->host || !rdma->host[0]) { 2624 error_setg(errp, "RDMA ERROR: RDMA host is not set!"); 2625 rdma->errored = true; 2626 return -1; 2627 } 2628 /* create CM channel */ 2629 rdma->channel = rdma_create_event_channel(); 2630 if (!rdma->channel) { 2631 error_setg(errp, "RDMA ERROR: could not create rdma event channel"); 2632 rdma->errored = true; 2633 return -1; 2634 } 2635 2636 /* create CM id */ 2637 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP); 2638 if (ret < 0) { 2639 error_setg(errp, "RDMA ERROR: could not create cm_id!"); 2640 goto err_dest_init_create_listen_id; 2641 } 2642 2643 snprintf(port_str, 16, "%d", rdma->port); 2644 port_str[15] = '\0'; 2645 2646 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); 2647 if (ret) { 2648 error_setg(errp, 
"RDMA ERROR: could not rdma_getaddrinfo address %s", 2649 rdma->host); 2650 goto err_dest_init_bind_addr; 2651 } 2652 2653 ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR, 2654 &reuse, sizeof reuse); 2655 if (ret < 0) { 2656 error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option"); 2657 goto err_dest_init_bind_addr; 2658 } 2659 2660 /* Try all addresses, saving the first error in @err */ 2661 for (e = res; e != NULL; e = e->ai_next) { 2662 Error **local_errp = err ? NULL : &err; 2663 2664 inet_ntop(e->ai_family, 2665 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); 2666 trace_qemu_rdma_dest_init_trying(rdma->host, ip); 2667 ret = rdma_bind_addr(listen_id, e->ai_dst_addr); 2668 if (ret < 0) { 2669 continue; 2670 } 2671 if (e->ai_family == AF_INET6) { 2672 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, 2673 local_errp); 2674 if (ret < 0) { 2675 continue; 2676 } 2677 } 2678 error_free(err); 2679 break; 2680 } 2681 2682 rdma_freeaddrinfo(res); 2683 if (!e) { 2684 if (err) { 2685 error_propagate(errp, err); 2686 } else { 2687 error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!"); 2688 } 2689 goto err_dest_init_bind_addr; 2690 } 2691 2692 rdma->listen_id = listen_id; 2693 qemu_rdma_dump_gid("dest_init", listen_id); 2694 return 0; 2695 2696 err_dest_init_bind_addr: 2697 rdma_destroy_id(listen_id); 2698 err_dest_init_create_listen_id: 2699 rdma_destroy_event_channel(rdma->channel); 2700 rdma->channel = NULL; 2701 rdma->errored = true; 2702 return -1; 2703 2704 } 2705 2706 static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path, 2707 RDMAContext *rdma) 2708 { 2709 for (int i = 0; i < RDMA_WRID_MAX; i++) { 2710 rdma_return_path->wr_data[i].control_len = 0; 2711 rdma_return_path->wr_data[i].control_curr = NULL; 2712 } 2713 2714 /*the CM channel and CM id is shared*/ 2715 rdma_return_path->channel = rdma->channel; 2716 rdma_return_path->listen_id = rdma->listen_id; 2717 2718 
rdma->return_path = rdma_return_path; 2719 rdma_return_path->return_path = rdma; 2720 rdma_return_path->is_return_path = true; 2721 } 2722 2723 static RDMAContext *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp) 2724 { 2725 RDMAContext *rdma = NULL; 2726 2727 rdma = g_new0(RDMAContext, 1); 2728 rdma->current_index = -1; 2729 rdma->current_chunk = -1; 2730 2731 rdma->host = g_strdup(saddr->host); 2732 rdma->port = atoi(saddr->port); 2733 return rdma; 2734 } 2735 2736 /* 2737 * QEMUFile interface to the control channel. 2738 * SEND messages for control only. 2739 * VM's ram is handled with regular RDMA messages. 2740 */ 2741 static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, 2742 const struct iovec *iov, 2743 size_t niov, 2744 int *fds, 2745 size_t nfds, 2746 int flags, 2747 Error **errp) 2748 { 2749 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 2750 RDMAContext *rdma; 2751 int ret; 2752 ssize_t done = 0; 2753 size_t len; 2754 2755 RCU_READ_LOCK_GUARD(); 2756 rdma = qatomic_rcu_read(&rioc->rdmaout); 2757 2758 if (!rdma) { 2759 error_setg(errp, "RDMA control channel output is not set"); 2760 return -1; 2761 } 2762 2763 if (rdma->errored) { 2764 error_setg(errp, 2765 "RDMA is in an error state waiting migration to abort!"); 2766 return -1; 2767 } 2768 2769 /* 2770 * Push out any writes that 2771 * we're queued up for VM's ram. 
2772 */ 2773 ret = qemu_rdma_write_flush(rdma, errp); 2774 if (ret < 0) { 2775 rdma->errored = true; 2776 return -1; 2777 } 2778 2779 for (int i = 0; i < niov; i++) { 2780 size_t remaining = iov[i].iov_len; 2781 uint8_t * data = (void *)iov[i].iov_base; 2782 while (remaining) { 2783 RDMAControlHeader head = {}; 2784 2785 len = MIN(remaining, RDMA_SEND_INCREMENT); 2786 remaining -= len; 2787 2788 head.len = len; 2789 head.type = RDMA_CONTROL_QEMU_FILE; 2790 2791 ret = qemu_rdma_exchange_send(rdma, &head, 2792 data, NULL, NULL, NULL, errp); 2793 2794 if (ret < 0) { 2795 rdma->errored = true; 2796 return -1; 2797 } 2798 2799 data += len; 2800 done += len; 2801 } 2802 } 2803 2804 return done; 2805 } 2806 2807 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, 2808 size_t size, int idx) 2809 { 2810 size_t len = 0; 2811 2812 if (rdma->wr_data[idx].control_len) { 2813 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size); 2814 2815 len = MIN(size, rdma->wr_data[idx].control_len); 2816 memcpy(buf, rdma->wr_data[idx].control_curr, len); 2817 rdma->wr_data[idx].control_curr += len; 2818 rdma->wr_data[idx].control_len -= len; 2819 } 2820 2821 return len; 2822 } 2823 2824 /* 2825 * QEMUFile interface to the control channel. 2826 * RDMA links don't use bytestreams, so we have to 2827 * return bytes to QEMUFile opportunistically. 
2828 */ 2829 static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, 2830 const struct iovec *iov, 2831 size_t niov, 2832 int **fds, 2833 size_t *nfds, 2834 int flags, 2835 Error **errp) 2836 { 2837 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 2838 RDMAContext *rdma; 2839 RDMAControlHeader head; 2840 int ret; 2841 ssize_t done = 0; 2842 size_t len; 2843 2844 RCU_READ_LOCK_GUARD(); 2845 rdma = qatomic_rcu_read(&rioc->rdmain); 2846 2847 if (!rdma) { 2848 error_setg(errp, "RDMA control channel input is not set"); 2849 return -1; 2850 } 2851 2852 if (rdma->errored) { 2853 error_setg(errp, 2854 "RDMA is in an error state waiting migration to abort!"); 2855 return -1; 2856 } 2857 2858 for (int i = 0; i < niov; i++) { 2859 size_t want = iov[i].iov_len; 2860 uint8_t *data = (void *)iov[i].iov_base; 2861 2862 /* 2863 * First, we hold on to the last SEND message we 2864 * were given and dish out the bytes until we run 2865 * out of bytes. 2866 */ 2867 len = qemu_rdma_fill(rdma, data, want, 0); 2868 done += len; 2869 want -= len; 2870 /* Got what we needed, so go to next iovec */ 2871 if (want == 0) { 2872 continue; 2873 } 2874 2875 /* If we got any data so far, then don't wait 2876 * for more, just return what we have */ 2877 if (done > 0) { 2878 break; 2879 } 2880 2881 2882 /* We've got nothing at all, so lets wait for 2883 * more to arrive 2884 */ 2885 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE, 2886 errp); 2887 2888 if (ret < 0) { 2889 rdma->errored = true; 2890 return -1; 2891 } 2892 2893 /* 2894 * SEND was received with new bytes, now try again. 2895 */ 2896 len = qemu_rdma_fill(rdma, data, want, 0); 2897 done += len; 2898 want -= len; 2899 2900 /* Still didn't get enough, so lets just return */ 2901 if (want) { 2902 if (done == 0) { 2903 return QIO_CHANNEL_ERR_BLOCK; 2904 } else { 2905 break; 2906 } 2907 } 2908 } 2909 return done; 2910 } 2911 2912 /* 2913 * Block until all the outstanding chunks have been delivered by the hardware. 
2914 */ 2915 static int qemu_rdma_drain_cq(RDMAContext *rdma) 2916 { 2917 Error *err = NULL; 2918 2919 if (qemu_rdma_write_flush(rdma, &err) < 0) { 2920 error_report_err(err); 2921 return -1; 2922 } 2923 2924 while (rdma->nb_sent) { 2925 if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) { 2926 error_report("rdma migration: complete polling error!"); 2927 return -1; 2928 } 2929 } 2930 2931 qemu_rdma_unregister_waiting(rdma); 2932 2933 return 0; 2934 } 2935 2936 2937 static int qio_channel_rdma_set_blocking(QIOChannel *ioc, 2938 bool blocking, 2939 Error **errp) 2940 { 2941 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 2942 /* XXX we should make readv/writev actually honour this :-) */ 2943 rioc->blocking = blocking; 2944 return 0; 2945 } 2946 2947 2948 typedef struct QIOChannelRDMASource QIOChannelRDMASource; 2949 struct QIOChannelRDMASource { 2950 GSource parent; 2951 QIOChannelRDMA *rioc; 2952 GIOCondition condition; 2953 }; 2954 2955 static gboolean 2956 qio_channel_rdma_source_prepare(GSource *source, 2957 gint *timeout) 2958 { 2959 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; 2960 RDMAContext *rdma; 2961 GIOCondition cond = 0; 2962 *timeout = -1; 2963 2964 RCU_READ_LOCK_GUARD(); 2965 if (rsource->condition == G_IO_IN) { 2966 rdma = qatomic_rcu_read(&rsource->rioc->rdmain); 2967 } else { 2968 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout); 2969 } 2970 2971 if (!rdma) { 2972 error_report("RDMAContext is NULL when prepare Gsource"); 2973 return FALSE; 2974 } 2975 2976 if (rdma->wr_data[0].control_len) { 2977 cond |= G_IO_IN; 2978 } 2979 cond |= G_IO_OUT; 2980 2981 return cond & rsource->condition; 2982 } 2983 2984 static gboolean 2985 qio_channel_rdma_source_check(GSource *source) 2986 { 2987 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; 2988 RDMAContext *rdma; 2989 GIOCondition cond = 0; 2990 2991 RCU_READ_LOCK_GUARD(); 2992 if (rsource->condition == G_IO_IN) { 2993 rdma = 
qatomic_rcu_read(&rsource->rioc->rdmain); 2994 } else { 2995 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout); 2996 } 2997 2998 if (!rdma) { 2999 error_report("RDMAContext is NULL when check Gsource"); 3000 return FALSE; 3001 } 3002 3003 if (rdma->wr_data[0].control_len) { 3004 cond |= G_IO_IN; 3005 } 3006 cond |= G_IO_OUT; 3007 3008 return cond & rsource->condition; 3009 } 3010 3011 static gboolean 3012 qio_channel_rdma_source_dispatch(GSource *source, 3013 GSourceFunc callback, 3014 gpointer user_data) 3015 { 3016 QIOChannelFunc func = (QIOChannelFunc)callback; 3017 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; 3018 RDMAContext *rdma; 3019 GIOCondition cond = 0; 3020 3021 RCU_READ_LOCK_GUARD(); 3022 if (rsource->condition == G_IO_IN) { 3023 rdma = qatomic_rcu_read(&rsource->rioc->rdmain); 3024 } else { 3025 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout); 3026 } 3027 3028 if (!rdma) { 3029 error_report("RDMAContext is NULL when dispatch Gsource"); 3030 return FALSE; 3031 } 3032 3033 if (rdma->wr_data[0].control_len) { 3034 cond |= G_IO_IN; 3035 } 3036 cond |= G_IO_OUT; 3037 3038 return (*func)(QIO_CHANNEL(rsource->rioc), 3039 (cond & rsource->condition), 3040 user_data); 3041 } 3042 3043 static void 3044 qio_channel_rdma_source_finalize(GSource *source) 3045 { 3046 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source; 3047 3048 object_unref(OBJECT(ssource->rioc)); 3049 } 3050 3051 static GSourceFuncs qio_channel_rdma_source_funcs = { 3052 qio_channel_rdma_source_prepare, 3053 qio_channel_rdma_source_check, 3054 qio_channel_rdma_source_dispatch, 3055 qio_channel_rdma_source_finalize 3056 }; 3057 3058 static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc, 3059 GIOCondition condition) 3060 { 3061 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 3062 QIOChannelRDMASource *ssource; 3063 GSource *source; 3064 3065 source = g_source_new(&qio_channel_rdma_source_funcs, 3066 sizeof(QIOChannelRDMASource)); 3067 ssource = 
(QIOChannelRDMASource *)source; 3068 3069 ssource->rioc = rioc; 3070 object_ref(OBJECT(rioc)); 3071 3072 ssource->condition = condition; 3073 3074 return source; 3075 } 3076 3077 static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc, 3078 AioContext *read_ctx, 3079 IOHandler *io_read, 3080 AioContext *write_ctx, 3081 IOHandler *io_write, 3082 void *opaque) 3083 { 3084 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 3085 if (io_read) { 3086 aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd, 3087 io_read, io_write, NULL, NULL, opaque); 3088 aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd, 3089 io_read, io_write, NULL, NULL, opaque); 3090 } else { 3091 aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd, 3092 io_read, io_write, NULL, NULL, opaque); 3093 aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd, 3094 io_read, io_write, NULL, NULL, opaque); 3095 } 3096 } 3097 3098 struct rdma_close_rcu { 3099 struct rcu_head rcu; 3100 RDMAContext *rdmain; 3101 RDMAContext *rdmaout; 3102 }; 3103 3104 /* callback from qio_channel_rdma_close via call_rcu */ 3105 static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu) 3106 { 3107 if (rcu->rdmain) { 3108 qemu_rdma_cleanup(rcu->rdmain); 3109 } 3110 3111 if (rcu->rdmaout) { 3112 qemu_rdma_cleanup(rcu->rdmaout); 3113 } 3114 3115 g_free(rcu->rdmain); 3116 g_free(rcu->rdmaout); 3117 g_free(rcu); 3118 } 3119 3120 static int qio_channel_rdma_close(QIOChannel *ioc, 3121 Error **errp) 3122 { 3123 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 3124 RDMAContext *rdmain, *rdmaout; 3125 struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1); 3126 3127 trace_qemu_rdma_close(); 3128 3129 rdmain = rioc->rdmain; 3130 if (rdmain) { 3131 qatomic_rcu_set(&rioc->rdmain, NULL); 3132 } 3133 3134 rdmaout = rioc->rdmaout; 3135 if (rdmaout) { 3136 qatomic_rcu_set(&rioc->rdmaout, NULL); 3137 } 3138 3139 rcu->rdmain = rdmain; 3140 rcu->rdmaout = rdmaout; 3141 call_rcu(rcu, 
qio_channel_rdma_close_rcu, rcu); 3142 3143 return 0; 3144 } 3145 3146 static int 3147 qio_channel_rdma_shutdown(QIOChannel *ioc, 3148 QIOChannelShutdown how, 3149 Error **errp) 3150 { 3151 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); 3152 RDMAContext *rdmain, *rdmaout; 3153 3154 RCU_READ_LOCK_GUARD(); 3155 3156 rdmain = qatomic_rcu_read(&rioc->rdmain); 3157 rdmaout = qatomic_rcu_read(&rioc->rdmain); 3158 3159 switch (how) { 3160 case QIO_CHANNEL_SHUTDOWN_READ: 3161 if (rdmain) { 3162 rdmain->errored = true; 3163 } 3164 break; 3165 case QIO_CHANNEL_SHUTDOWN_WRITE: 3166 if (rdmaout) { 3167 rdmaout->errored = true; 3168 } 3169 break; 3170 case QIO_CHANNEL_SHUTDOWN_BOTH: 3171 default: 3172 if (rdmain) { 3173 rdmain->errored = true; 3174 } 3175 if (rdmaout) { 3176 rdmaout->errored = true; 3177 } 3178 break; 3179 } 3180 3181 return 0; 3182 } 3183 3184 /* 3185 * Parameters: 3186 * @offset == 0 : 3187 * This means that 'block_offset' is a full virtual address that does not 3188 * belong to a RAMBlock of the virtual machine and instead 3189 * represents a private malloc'd memory area that the caller wishes to 3190 * transfer. 3191 * 3192 * @offset != 0 : 3193 * Offset is an offset to be added to block_offset and used 3194 * to also lookup the corresponding RAMBlock. 3195 * 3196 * @size : Number of bytes to transfer 3197 * 3198 * @pages_sent : User-specificed pointer to indicate how many pages were 3199 * sent. Usually, this will not be more than a few bytes of 3200 * the protocol because most transfers are sent asynchronously. 
3201 */ 3202 static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset, 3203 ram_addr_t offset, size_t size) 3204 { 3205 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); 3206 Error *err = NULL; 3207 RDMAContext *rdma; 3208 int ret; 3209 3210 RCU_READ_LOCK_GUARD(); 3211 rdma = qatomic_rcu_read(&rioc->rdmaout); 3212 3213 if (!rdma) { 3214 return -1; 3215 } 3216 3217 if (rdma_errored(rdma)) { 3218 return -1; 3219 } 3220 3221 qemu_fflush(f); 3222 3223 /* 3224 * Add this page to the current 'chunk'. If the chunk 3225 * is full, or the page doesn't belong to the current chunk, 3226 * an actual RDMA write will occur and a new chunk will be formed. 3227 */ 3228 ret = qemu_rdma_write(rdma, block_offset, offset, size, &err); 3229 if (ret < 0) { 3230 error_report_err(err); 3231 goto err; 3232 } 3233 3234 /* 3235 * Drain the Completion Queue if possible, but do not block, 3236 * just poll. 3237 * 3238 * If nothing to poll, the end of the iteration will do this 3239 * again to make sure we don't overflow the request queue. 
3240 */ 3241 while (1) { 3242 uint64_t wr_id, wr_id_in; 3243 ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL); 3244 3245 if (ret < 0) { 3246 error_report("rdma migration: polling error"); 3247 goto err; 3248 } 3249 3250 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; 3251 3252 if (wr_id == RDMA_WRID_NONE) { 3253 break; 3254 } 3255 } 3256 3257 while (1) { 3258 uint64_t wr_id, wr_id_in; 3259 ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL); 3260 3261 if (ret < 0) { 3262 error_report("rdma migration: polling error"); 3263 goto err; 3264 } 3265 3266 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; 3267 3268 if (wr_id == RDMA_WRID_NONE) { 3269 break; 3270 } 3271 } 3272 3273 return RAM_SAVE_CONTROL_DELAYED; 3274 3275 err: 3276 rdma->errored = true; 3277 return -1; 3278 } 3279 3280 int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset, 3281 ram_addr_t offset, size_t size) 3282 { 3283 if (!migrate_rdma() || migration_in_postcopy()) { 3284 return RAM_SAVE_CONTROL_NOT_SUPP; 3285 } 3286 3287 int ret = qemu_rdma_save_page(f, block_offset, offset, size); 3288 3289 if (ret != RAM_SAVE_CONTROL_DELAYED && 3290 ret != RAM_SAVE_CONTROL_NOT_SUPP) { 3291 if (ret < 0) { 3292 qemu_file_set_error(f, ret); 3293 } 3294 } 3295 return ret; 3296 } 3297 3298 static void rdma_accept_incoming_migration(void *opaque); 3299 3300 static void rdma_cm_poll_handler(void *opaque) 3301 { 3302 RDMAContext *rdma = opaque; 3303 struct rdma_cm_event *cm_event; 3304 MigrationIncomingState *mis = migration_incoming_get_current(); 3305 3306 if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) { 3307 error_report("get_cm_event failed %d", errno); 3308 return; 3309 } 3310 3311 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED || 3312 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { 3313 if (!rdma->errored && 3314 migration_incoming_get_current()->state != 3315 MIGRATION_STATUS_COMPLETED) { 3316 error_report("receive cm event, cm event is %d", cm_event->event); 3317 rdma->errored = true; 3318 if 
(rdma->return_path) { 3319 rdma->return_path->errored = true; 3320 } 3321 } 3322 rdma_ack_cm_event(cm_event); 3323 if (mis->loadvm_co) { 3324 qemu_coroutine_enter(mis->loadvm_co); 3325 } 3326 return; 3327 } 3328 rdma_ack_cm_event(cm_event); 3329 } 3330 3331 static int qemu_rdma_accept(RDMAContext *rdma) 3332 { 3333 Error *err = NULL; 3334 RDMACapabilities cap; 3335 struct rdma_conn_param conn_param = { 3336 .responder_resources = 2, 3337 .private_data = &cap, 3338 .private_data_len = sizeof(cap), 3339 }; 3340 RDMAContext *rdma_return_path = NULL; 3341 g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1); 3342 struct rdma_cm_event *cm_event; 3343 struct ibv_context *verbs; 3344 int ret; 3345 3346 ret = rdma_get_cm_event(rdma->channel, &cm_event); 3347 if (ret < 0) { 3348 goto err_rdma_dest_wait; 3349 } 3350 3351 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { 3352 rdma_ack_cm_event(cm_event); 3353 goto err_rdma_dest_wait; 3354 } 3355 3356 isock->host = rdma->host; 3357 isock->port = g_strdup_printf("%d", rdma->port); 3358 3359 /* 3360 * initialize the RDMAContext for return path for postcopy after first 3361 * connection request reached. 3362 */ 3363 if ((migrate_postcopy() || migrate_return_path()) 3364 && !rdma->is_return_path) { 3365 rdma_return_path = qemu_rdma_data_init(isock, NULL); 3366 if (rdma_return_path == NULL) { 3367 rdma_ack_cm_event(cm_event); 3368 goto err_rdma_dest_wait; 3369 } 3370 3371 qemu_rdma_return_path_dest_init(rdma_return_path, rdma); 3372 } 3373 3374 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); 3375 3376 network_to_caps(&cap); 3377 3378 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) { 3379 error_report("Unknown source RDMA version: %d, bailing...", 3380 cap.version); 3381 rdma_ack_cm_event(cm_event); 3382 goto err_rdma_dest_wait; 3383 } 3384 3385 /* 3386 * Respond with only the capabilities this version of QEMU knows about. 
3387 */ 3388 cap.flags &= known_capabilities; 3389 3390 /* 3391 * Enable the ones that we do know about. 3392 * Add other checks here as new ones are introduced. 3393 */ 3394 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) { 3395 rdma->pin_all = true; 3396 } 3397 3398 rdma->cm_id = cm_event->id; 3399 verbs = cm_event->id->verbs; 3400 3401 rdma_ack_cm_event(cm_event); 3402 3403 trace_qemu_rdma_accept_pin_state(rdma->pin_all); 3404 3405 caps_to_network(&cap); 3406 3407 trace_qemu_rdma_accept_pin_verbsc(verbs); 3408 3409 if (!rdma->verbs) { 3410 rdma->verbs = verbs; 3411 } else if (rdma->verbs != verbs) { 3412 error_report("ibv context not matching %p, %p!", rdma->verbs, 3413 verbs); 3414 goto err_rdma_dest_wait; 3415 } 3416 3417 qemu_rdma_dump_id("dest_init", verbs); 3418 3419 ret = qemu_rdma_alloc_pd_cq(rdma, &err); 3420 if (ret < 0) { 3421 error_report_err(err); 3422 goto err_rdma_dest_wait; 3423 } 3424 3425 ret = qemu_rdma_alloc_qp(rdma); 3426 if (ret < 0) { 3427 error_report("rdma migration: error allocating qp!"); 3428 goto err_rdma_dest_wait; 3429 } 3430 3431 qemu_rdma_init_ram_blocks(rdma); 3432 3433 for (int i = 0; i < RDMA_WRID_MAX; i++) { 3434 ret = qemu_rdma_reg_control(rdma, i); 3435 if (ret < 0) { 3436 error_report("rdma: error registering %d control", i); 3437 goto err_rdma_dest_wait; 3438 } 3439 } 3440 3441 /* Accept the second connection request for return path */ 3442 if ((migrate_postcopy() || migrate_return_path()) 3443 && !rdma->is_return_path) { 3444 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, 3445 NULL, 3446 (void *)(intptr_t)rdma->return_path); 3447 } else { 3448 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler, 3449 NULL, rdma); 3450 } 3451 3452 ret = rdma_accept(rdma->cm_id, &conn_param); 3453 if (ret < 0) { 3454 error_report("rdma_accept failed"); 3455 goto err_rdma_dest_wait; 3456 } 3457 3458 ret = rdma_get_cm_event(rdma->channel, &cm_event); 3459 if (ret < 0) { 3460 error_report("rdma_accept get_cm_event 
failed"); 3461 goto err_rdma_dest_wait; 3462 } 3463 3464 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { 3465 error_report("rdma_accept not event established"); 3466 rdma_ack_cm_event(cm_event); 3467 goto err_rdma_dest_wait; 3468 } 3469 3470 rdma_ack_cm_event(cm_event); 3471 rdma->connected = true; 3472 3473 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err); 3474 if (ret < 0) { 3475 error_report_err(err); 3476 goto err_rdma_dest_wait; 3477 } 3478 3479 qemu_rdma_dump_gid("dest_connect", rdma->cm_id); 3480 3481 return 0; 3482 3483 err_rdma_dest_wait: 3484 rdma->errored = true; 3485 qemu_rdma_cleanup(rdma); 3486 g_free(rdma_return_path); 3487 return -1; 3488 } 3489 3490 static int dest_ram_sort_func(const void *a, const void *b) 3491 { 3492 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; 3493 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; 3494 3495 return (a_index < b_index) ? -1 : (a_index != b_index); 3496 } 3497 3498 /* 3499 * During each iteration of the migration, we listen for instructions 3500 * by the source VM to perform dynamic page registrations before they 3501 * can perform RDMA operations. 3502 * 3503 * We respond with the 'rkey'. 3504 * 3505 * Keep doing this until the source tells us to stop. 
3506 */ 3507 int rdma_registration_handle(QEMUFile *f) 3508 { 3509 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), 3510 .type = RDMA_CONTROL_REGISTER_RESULT, 3511 .repeat = 0, 3512 }; 3513 RDMAControlHeader unreg_resp = { .len = 0, 3514 .type = RDMA_CONTROL_UNREGISTER_FINISHED, 3515 .repeat = 0, 3516 }; 3517 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT, 3518 .repeat = 1 }; 3519 QIOChannelRDMA *rioc; 3520 Error *err = NULL; 3521 RDMAContext *rdma; 3522 RDMALocalBlocks *local; 3523 RDMAControlHeader head; 3524 RDMARegister *reg, *registers; 3525 RDMACompress *comp; 3526 RDMARegisterResult *reg_result; 3527 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE]; 3528 RDMALocalBlock *block; 3529 void *host_addr; 3530 int ret; 3531 int idx = 0; 3532 3533 if (!migrate_rdma()) { 3534 return 0; 3535 } 3536 3537 RCU_READ_LOCK_GUARD(); 3538 rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); 3539 rdma = qatomic_rcu_read(&rioc->rdmain); 3540 3541 if (!rdma) { 3542 return -1; 3543 } 3544 3545 if (rdma_errored(rdma)) { 3546 return -1; 3547 } 3548 3549 local = &rdma->local_ram_blocks; 3550 do { 3551 trace_rdma_registration_handle_wait(); 3552 3553 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err); 3554 3555 if (ret < 0) { 3556 error_report_err(err); 3557 break; 3558 } 3559 3560 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) { 3561 error_report("rdma: Too many requests in this message (%d)." 
3562 "Bailing.", head.repeat); 3563 break; 3564 } 3565 3566 switch (head.type) { 3567 case RDMA_CONTROL_COMPRESS: 3568 comp = (RDMACompress *) rdma->wr_data[idx].control_curr; 3569 network_to_compress(comp); 3570 3571 trace_rdma_registration_handle_compress(comp->length, 3572 comp->block_idx, 3573 comp->offset); 3574 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { 3575 error_report("rdma: 'compress' bad block index %u (vs %d)", 3576 (unsigned int)comp->block_idx, 3577 rdma->local_ram_blocks.nb_blocks); 3578 goto err; 3579 } 3580 block = &(rdma->local_ram_blocks.block[comp->block_idx]); 3581 3582 host_addr = block->local_host_addr + 3583 (comp->offset - block->offset); 3584 if (comp->value) { 3585 error_report("rdma: Zero page with non-zero (%d) value", 3586 comp->value); 3587 goto err; 3588 } 3589 ram_handle_zero(host_addr, comp->length); 3590 break; 3591 3592 case RDMA_CONTROL_REGISTER_FINISHED: 3593 trace_rdma_registration_handle_finished(); 3594 return 0; 3595 3596 case RDMA_CONTROL_RAM_BLOCKS_REQUEST: 3597 trace_rdma_registration_handle_ram_blocks(); 3598 3599 /* Sort our local RAM Block list so it's the same as the source, 3600 * we can do this since we've filled in a src_index in the list 3601 * as we received the RAMBlock list earlier. 3602 */ 3603 qsort(rdma->local_ram_blocks.block, 3604 rdma->local_ram_blocks.nb_blocks, 3605 sizeof(RDMALocalBlock), dest_ram_sort_func); 3606 for (int i = 0; i < local->nb_blocks; i++) { 3607 local->block[i].index = i; 3608 } 3609 3610 if (rdma->pin_all) { 3611 ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err); 3612 if (ret < 0) { 3613 error_report_err(err); 3614 goto err; 3615 } 3616 } 3617 3618 /* 3619 * Dest uses this to prepare to transmit the RAMBlock descriptions 3620 * to the source VM after connection setup. 3621 * Both sides use the "remote" structure to communicate and update 3622 * their "local" descriptions with what was sent. 
3623 */ 3624 for (int i = 0; i < local->nb_blocks; i++) { 3625 rdma->dest_blocks[i].remote_host_addr = 3626 (uintptr_t)(local->block[i].local_host_addr); 3627 3628 if (rdma->pin_all) { 3629 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey; 3630 } 3631 3632 rdma->dest_blocks[i].offset = local->block[i].offset; 3633 rdma->dest_blocks[i].length = local->block[i].length; 3634 3635 dest_block_to_network(&rdma->dest_blocks[i]); 3636 trace_rdma_registration_handle_ram_blocks_loop( 3637 local->block[i].block_name, 3638 local->block[i].offset, 3639 local->block[i].length, 3640 local->block[i].local_host_addr, 3641 local->block[i].src_index); 3642 } 3643 3644 blocks.len = rdma->local_ram_blocks.nb_blocks 3645 * sizeof(RDMADestBlock); 3646 3647 3648 ret = qemu_rdma_post_send_control(rdma, 3649 (uint8_t *) rdma->dest_blocks, &blocks, 3650 &err); 3651 3652 if (ret < 0) { 3653 error_report_err(err); 3654 goto err; 3655 } 3656 3657 break; 3658 case RDMA_CONTROL_REGISTER_REQUEST: 3659 trace_rdma_registration_handle_register(head.repeat); 3660 3661 reg_resp.repeat = head.repeat; 3662 registers = (RDMARegister *) rdma->wr_data[idx].control_curr; 3663 3664 for (int count = 0; count < head.repeat; count++) { 3665 uint64_t chunk; 3666 uint8_t *chunk_start, *chunk_end; 3667 3668 reg = ®isters[count]; 3669 network_to_register(reg); 3670 3671 reg_result = &results[count]; 3672 3673 trace_rdma_registration_handle_register_loop(count, 3674 reg->current_index, reg->key.current_addr, reg->chunks); 3675 3676 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { 3677 error_report("rdma: 'register' bad block index %u (vs %d)", 3678 (unsigned int)reg->current_index, 3679 rdma->local_ram_blocks.nb_blocks); 3680 goto err; 3681 } 3682 block = &(rdma->local_ram_blocks.block[reg->current_index]); 3683 if (block->is_ram_block) { 3684 if (block->offset > reg->key.current_addr) { 3685 error_report("rdma: bad register address for block %s" 3686 " offset: %" PRIx64 " current_addr: %" 
PRIx64, 3687 block->block_name, block->offset, 3688 reg->key.current_addr); 3689 goto err; 3690 } 3691 host_addr = (block->local_host_addr + 3692 (reg->key.current_addr - block->offset)); 3693 chunk = ram_chunk_index(block->local_host_addr, 3694 (uint8_t *) host_addr); 3695 } else { 3696 chunk = reg->key.chunk; 3697 host_addr = block->local_host_addr + 3698 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT)); 3699 /* Check for particularly bad chunk value */ 3700 if (host_addr < (void *)block->local_host_addr) { 3701 error_report("rdma: bad chunk for block %s" 3702 " chunk: %" PRIx64, 3703 block->block_name, reg->key.chunk); 3704 goto err; 3705 } 3706 } 3707 chunk_start = ram_chunk_start(block, chunk); 3708 chunk_end = ram_chunk_end(block, chunk + reg->chunks); 3709 /* avoid "-Waddress-of-packed-member" warning */ 3710 uint32_t tmp_rkey = 0; 3711 if (qemu_rdma_register_and_get_keys(rdma, block, 3712 (uintptr_t)host_addr, NULL, &tmp_rkey, 3713 chunk, chunk_start, chunk_end)) { 3714 error_report("cannot get rkey"); 3715 goto err; 3716 } 3717 reg_result->rkey = tmp_rkey; 3718 3719 reg_result->host_addr = (uintptr_t)block->local_host_addr; 3720 3721 trace_rdma_registration_handle_register_rkey(reg_result->rkey); 3722 3723 result_to_network(reg_result); 3724 } 3725 3726 ret = qemu_rdma_post_send_control(rdma, 3727 (uint8_t *) results, ®_resp, &err); 3728 3729 if (ret < 0) { 3730 error_report_err(err); 3731 goto err; 3732 } 3733 break; 3734 case RDMA_CONTROL_UNREGISTER_REQUEST: 3735 trace_rdma_registration_handle_unregister(head.repeat); 3736 unreg_resp.repeat = head.repeat; 3737 registers = (RDMARegister *) rdma->wr_data[idx].control_curr; 3738 3739 for (int count = 0; count < head.repeat; count++) { 3740 reg = ®isters[count]; 3741 network_to_register(reg); 3742 3743 trace_rdma_registration_handle_unregister_loop(count, 3744 reg->current_index, reg->key.chunk); 3745 3746 block = &(rdma->local_ram_blocks.block[reg->current_index]); 3747 3748 ret = 
ibv_dereg_mr(block->pmr[reg->key.chunk]); 3749 block->pmr[reg->key.chunk] = NULL; 3750 3751 if (ret != 0) { 3752 error_report("rdma unregistration chunk failed: %s", 3753 strerror(errno)); 3754 goto err; 3755 } 3756 3757 rdma->total_registrations--; 3758 3759 trace_rdma_registration_handle_unregister_success(reg->key.chunk); 3760 } 3761 3762 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err); 3763 3764 if (ret < 0) { 3765 error_report_err(err); 3766 goto err; 3767 } 3768 break; 3769 case RDMA_CONTROL_REGISTER_RESULT: 3770 error_report("Invalid RESULT message at dest."); 3771 goto err; 3772 default: 3773 error_report("Unknown control message %s", control_desc(head.type)); 3774 goto err; 3775 } 3776 } while (1); 3777 3778 err: 3779 rdma->errored = true; 3780 return -1; 3781 } 3782 3783 /* Destination: 3784 * Called during the initial RAM load section which lists the 3785 * RAMBlocks by name. This lets us know the order of the RAMBlocks on 3786 * the source. We've already built our local RAMBlock list, but not 3787 * yet sent the list to the source. 
3788 */ 3789 int rdma_block_notification_handle(QEMUFile *f, const char *name) 3790 { 3791 int curr; 3792 int found = -1; 3793 3794 if (!migrate_rdma()) { 3795 return 0; 3796 } 3797 3798 RCU_READ_LOCK_GUARD(); 3799 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); 3800 RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain); 3801 3802 if (!rdma) { 3803 return -1; 3804 } 3805 3806 /* Find the matching RAMBlock in our local list */ 3807 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { 3808 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { 3809 found = curr; 3810 break; 3811 } 3812 } 3813 3814 if (found == -1) { 3815 error_report("RAMBlock '%s' not found on destination", name); 3816 return -1; 3817 } 3818 3819 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; 3820 trace_rdma_block_notification_handle(name, rdma->next_src_index); 3821 rdma->next_src_index++; 3822 3823 return 0; 3824 } 3825 3826 int rdma_registration_start(QEMUFile *f, uint64_t flags) 3827 { 3828 if (!migrate_rdma() || migration_in_postcopy()) { 3829 return 0; 3830 } 3831 3832 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); 3833 RCU_READ_LOCK_GUARD(); 3834 RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout); 3835 if (!rdma) { 3836 return -1; 3837 } 3838 3839 if (rdma_errored(rdma)) { 3840 return -1; 3841 } 3842 3843 trace_rdma_registration_start(flags); 3844 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK); 3845 return qemu_fflush(f); 3846 } 3847 3848 /* 3849 * Inform dest that dynamic registrations are done for now. 3850 * First, flush writes, if any. 
3851 */ 3852 int rdma_registration_stop(QEMUFile *f, uint64_t flags) 3853 { 3854 QIOChannelRDMA *rioc; 3855 Error *err = NULL; 3856 RDMAContext *rdma; 3857 RDMAControlHeader head = { .len = 0, .repeat = 1 }; 3858 int ret; 3859 3860 if (!migrate_rdma() || migration_in_postcopy()) { 3861 return 0; 3862 } 3863 3864 RCU_READ_LOCK_GUARD(); 3865 rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); 3866 rdma = qatomic_rcu_read(&rioc->rdmaout); 3867 if (!rdma) { 3868 return -1; 3869 } 3870 3871 if (rdma_errored(rdma)) { 3872 return -1; 3873 } 3874 3875 qemu_fflush(f); 3876 ret = qemu_rdma_drain_cq(rdma); 3877 3878 if (ret < 0) { 3879 goto err; 3880 } 3881 3882 if (flags == RAM_CONTROL_SETUP) { 3883 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; 3884 RDMALocalBlocks *local = &rdma->local_ram_blocks; 3885 int reg_result_idx, nb_dest_blocks; 3886 3887 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; 3888 trace_rdma_registration_stop_ram(); 3889 3890 /* 3891 * Make sure that we parallelize the pinning on both sides. 3892 * For very large guests, doing this serially takes a really 3893 * long time, so we have to 'interleave' the pinning locally 3894 * with the control messages by performing the pinning on this 3895 * side before we receive the control response from the other 3896 * side that the pinning has completed. 3897 */ 3898 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, 3899 ®_result_idx, rdma->pin_all ? 3900 qemu_rdma_reg_whole_ram_blocks : NULL, 3901 &err); 3902 if (ret < 0) { 3903 error_report_err(err); 3904 return -1; 3905 } 3906 3907 nb_dest_blocks = resp.len / sizeof(RDMADestBlock); 3908 3909 /* 3910 * The protocol uses two different sets of rkeys (mutually exclusive): 3911 * 1. One key to represent the virtual address of the entire ram block. 3912 * (dynamic chunk registration disabled - pin everything with one rkey.) 3913 * 2. One to represent individual chunks within a ram block. 
3914 * (dynamic chunk registration enabled - pin individual chunks.) 3915 * 3916 * Once the capability is successfully negotiated, the destination transmits 3917 * the keys to use (or sends them later) including the virtual addresses 3918 * and then propagates the remote ram block descriptions to his local copy. 3919 */ 3920 3921 if (local->nb_blocks != nb_dest_blocks) { 3922 error_report("ram blocks mismatch (Number of blocks %d vs %d)", 3923 local->nb_blocks, nb_dest_blocks); 3924 error_printf("Your QEMU command line parameters are probably " 3925 "not identical on both the source and destination."); 3926 rdma->errored = true; 3927 return -1; 3928 } 3929 3930 qemu_rdma_move_header(rdma, reg_result_idx, &resp); 3931 memcpy(rdma->dest_blocks, 3932 rdma->wr_data[reg_result_idx].control_curr, resp.len); 3933 for (int i = 0; i < nb_dest_blocks; i++) { 3934 network_to_dest_block(&rdma->dest_blocks[i]); 3935 3936 /* We require that the blocks are in the same order */ 3937 if (rdma->dest_blocks[i].length != local->block[i].length) { 3938 error_report("Block %s/%d has a different length %" PRIu64 3939 "vs %" PRIu64, 3940 local->block[i].block_name, i, 3941 local->block[i].length, 3942 rdma->dest_blocks[i].length); 3943 rdma->errored = true; 3944 return -1; 3945 } 3946 local->block[i].remote_host_addr = 3947 rdma->dest_blocks[i].remote_host_addr; 3948 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; 3949 } 3950 } 3951 3952 trace_rdma_registration_stop(flags); 3953 3954 head.type = RDMA_CONTROL_REGISTER_FINISHED; 3955 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err); 3956 3957 if (ret < 0) { 3958 error_report_err(err); 3959 goto err; 3960 } 3961 3962 return 0; 3963 err: 3964 rdma->errored = true; 3965 return -1; 3966 } 3967 3968 static void qio_channel_rdma_finalize(Object *obj) 3969 { 3970 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj); 3971 if (rioc->rdmain) { 3972 qemu_rdma_cleanup(rioc->rdmain); 3973 g_free(rioc->rdmain); 3974 
rioc->rdmain = NULL; 3975 } 3976 if (rioc->rdmaout) { 3977 qemu_rdma_cleanup(rioc->rdmaout); 3978 g_free(rioc->rdmaout); 3979 rioc->rdmaout = NULL; 3980 } 3981 } 3982 3983 static void qio_channel_rdma_class_init(ObjectClass *klass, 3984 void *class_data G_GNUC_UNUSED) 3985 { 3986 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); 3987 3988 ioc_klass->io_writev = qio_channel_rdma_writev; 3989 ioc_klass->io_readv = qio_channel_rdma_readv; 3990 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking; 3991 ioc_klass->io_close = qio_channel_rdma_close; 3992 ioc_klass->io_create_watch = qio_channel_rdma_create_watch; 3993 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler; 3994 ioc_klass->io_shutdown = qio_channel_rdma_shutdown; 3995 } 3996 3997 static const TypeInfo qio_channel_rdma_info = { 3998 .parent = TYPE_QIO_CHANNEL, 3999 .name = TYPE_QIO_CHANNEL_RDMA, 4000 .instance_size = sizeof(QIOChannelRDMA), 4001 .instance_finalize = qio_channel_rdma_finalize, 4002 .class_init = qio_channel_rdma_class_init, 4003 }; 4004 4005 static void qio_channel_rdma_register_types(void) 4006 { 4007 type_register_static(&qio_channel_rdma_info); 4008 } 4009 4010 type_init(qio_channel_rdma_register_types); 4011 4012 static QEMUFile *rdma_new_input(RDMAContext *rdma) 4013 { 4014 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA)); 4015 4016 rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc)); 4017 rioc->rdmain = rdma; 4018 rioc->rdmaout = rdma->return_path; 4019 4020 return rioc->file; 4021 } 4022 4023 static QEMUFile *rdma_new_output(RDMAContext *rdma) 4024 { 4025 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA)); 4026 4027 rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc)); 4028 rioc->rdmaout = rdma; 4029 rioc->rdmain = rdma->return_path; 4030 4031 return rioc->file; 4032 } 4033 4034 static void rdma_accept_incoming_migration(void *opaque) 4035 { 4036 RDMAContext *rdma = opaque; 4037 QEMUFile *f; 4038 Error 
*local_err = NULL; 4039 4040 trace_qemu_rdma_accept_incoming_migration(); 4041 if (qemu_rdma_accept(rdma) < 0) { 4042 error_report("RDMA ERROR: Migration initialization failed"); 4043 return; 4044 } 4045 4046 trace_qemu_rdma_accept_incoming_migration_accepted(); 4047 4048 if (rdma->is_return_path) { 4049 return; 4050 } 4051 4052 f = rdma_new_input(rdma); 4053 if (f == NULL) { 4054 error_report("RDMA ERROR: could not open RDMA for input"); 4055 qemu_rdma_cleanup(rdma); 4056 return; 4057 } 4058 4059 rdma->migration_started_on_destination = 1; 4060 migration_fd_process_incoming(f, &local_err); 4061 if (local_err) { 4062 error_reportf_err(local_err, "RDMA ERROR:"); 4063 } 4064 } 4065 4066 void rdma_start_incoming_migration(InetSocketAddress *host_port, 4067 Error **errp) 4068 { 4069 MigrationState *s = migrate_get_current(); 4070 int ret; 4071 RDMAContext *rdma; 4072 4073 trace_rdma_start_incoming_migration(); 4074 4075 /* Avoid ram_block_discard_disable(), cannot change during migration. */ 4076 if (ram_block_discard_is_required()) { 4077 error_setg(errp, "RDMA: cannot disable RAM discard"); 4078 return; 4079 } 4080 4081 rdma = qemu_rdma_data_init(host_port, errp); 4082 if (rdma == NULL) { 4083 goto err; 4084 } 4085 4086 ret = qemu_rdma_dest_init(rdma, errp); 4087 if (ret < 0) { 4088 goto err; 4089 } 4090 4091 trace_rdma_start_incoming_migration_after_dest_init(); 4092 4093 ret = rdma_listen(rdma->listen_id, 5); 4094 4095 if (ret < 0) { 4096 error_setg(errp, "RDMA ERROR: listening on socket!"); 4097 goto cleanup_rdma; 4098 } 4099 4100 trace_rdma_start_incoming_migration_after_rdma_listen(); 4101 s->rdma_migration = true; 4102 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, 4103 NULL, (void *)(intptr_t)rdma); 4104 return; 4105 4106 cleanup_rdma: 4107 qemu_rdma_cleanup(rdma); 4108 err: 4109 if (rdma) { 4110 g_free(rdma->host); 4111 } 4112 g_free(rdma); 4113 } 4114 4115 void rdma_start_outgoing_migration(void *opaque, 4116 InetSocketAddress 
*host_port, Error **errp) 4117 { 4118 MigrationState *s = opaque; 4119 RDMAContext *rdma_return_path = NULL; 4120 RDMAContext *rdma; 4121 int ret; 4122 4123 /* Avoid ram_block_discard_disable(), cannot change during migration. */ 4124 if (ram_block_discard_is_required()) { 4125 error_setg(errp, "RDMA: cannot disable RAM discard"); 4126 return; 4127 } 4128 4129 rdma = qemu_rdma_data_init(host_port, errp); 4130 if (rdma == NULL) { 4131 goto err; 4132 } 4133 4134 ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp); 4135 4136 if (ret < 0) { 4137 goto err; 4138 } 4139 4140 trace_rdma_start_outgoing_migration_after_rdma_source_init(); 4141 ret = qemu_rdma_connect(rdma, false, errp); 4142 4143 if (ret < 0) { 4144 goto err; 4145 } 4146 4147 /* RDMA postcopy need a separate queue pair for return path */ 4148 if (migrate_postcopy() || migrate_return_path()) { 4149 rdma_return_path = qemu_rdma_data_init(host_port, errp); 4150 4151 if (rdma_return_path == NULL) { 4152 goto return_path_err; 4153 } 4154 4155 ret = qemu_rdma_source_init(rdma_return_path, 4156 migrate_rdma_pin_all(), errp); 4157 4158 if (ret < 0) { 4159 goto return_path_err; 4160 } 4161 4162 ret = qemu_rdma_connect(rdma_return_path, true, errp); 4163 4164 if (ret < 0) { 4165 goto return_path_err; 4166 } 4167 4168 rdma->return_path = rdma_return_path; 4169 rdma_return_path->return_path = rdma; 4170 rdma_return_path->is_return_path = true; 4171 } 4172 4173 trace_rdma_start_outgoing_migration_after_rdma_connect(); 4174 4175 s->to_dst_file = rdma_new_output(rdma); 4176 s->rdma_migration = true; 4177 migrate_fd_connect(s, NULL); 4178 return; 4179 return_path_err: 4180 qemu_rdma_cleanup(rdma); 4181 err: 4182 g_free(rdma); 4183 g_free(rdma_return_path); 4184 } 4185